import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Display every column of a DataFrame in cell output. None removes the column
# limit, so the setting keeps working after new columns are added to the frame
# (a fixed limit of 57 truncates the 59-column frame shown later).
pd.set_option('display.max_columns', None)
# Read the raw CSV into a DataFrame and show its first five rows
df = pd.read_csv(filepath_or_buffer="Obfuscated-MalMem2022.csv")
df.head(n=5)
| Category | pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.nprocs64bit | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | handles.nport | handles.nfile | handles.nevent | handles.ndesktop | handles.nkey | handles.nthread | handles.ndirectory | handles.nsemaphore | handles.ntimer | handles.nsection | handles.nmutant | ldrmodules.not_in_load | ldrmodules.not_in_init | ldrmodules.not_in_mem | ldrmodules.not_in_load_avg | ldrmodules.not_in_init_avg | ldrmodules.not_in_mem_avg | malfind.ninjections | malfind.commitCharge | malfind.protection | malfind.uniqueInjections | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_pspcid_list | psxview.not_in_csrss_handles | psxview.not_in_session | psxview.not_in_deskthrd | psxview.not_in_pslist_false_avg | psxview.not_in_eprocess_pool_false_avg | psxview.not_in_ethread_pool_false_avg | psxview.not_in_pspcid_list_false_avg | psxview.not_in_csrss_handles_false_avg | psxview.not_in_session_false_avg | psxview.not_in_deskthrd_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.interactive_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Benign | 45 | 17 | 10.555556 | 0 | 202.844444 | 1694 | 38.500000 | 9129 | 212.302326 | 0 | 670 | 3161 | 46 | 716 | 887 | 104 | 671 | 125 | 184 | 257 | 53 | 95 | 53 | 0.030372 | 0.054441 | 0.030372 | 5 | 21 | 30 | 1.250000 | 2 | 0 | 3 | 2 | 7 | 4 | 9 | 0.042553 | 0.0 | 0.063830 | 0.042553 | 0.148936 | 0.085106 | 0.191489 | 138 | 389 | 221 | 26 | 24 | 116 | 0 | 121 | 87 | 0 | 8 | Benign |
| 1 | Benign | 47 | 19 | 11.531915 | 0 | 242.234043 | 2074 | 44.127660 | 11385 | 242.234043 | 0 | 840 | 3761 | 51 | 1011 | 1030 | 117 | 766 | 148 | 337 | 394 | 77 | 123 | 77 | 0.036167 | 0.057774 | 0.036167 | 12 | 77 | 72 | 1.714286 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.085106 | 0.042553 | 0.127660 | 138 | 392 | 222 | 26 | 24 | 118 | 0 | 122 | 87 | 0 | 8 | Benign |
| 2 | Benign | 40 | 14 | 14.725000 | 0 | 288.225000 | 1932 | 48.300000 | 11529 | 288.225000 | 0 | 1050 | 3996 | 45 | 784 | 1241 | 100 | 645 | 138 | 369 | 338 | 51 | 89 | 51 | 0.026114 | 0.045571 | 0.026114 | 5 | 6 | 30 | 1.250000 | 0 | 0 | 0 | 0 | 4 | 2 | 5 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.100000 | 0.050000 | 0.125000 | 137 | 395 | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | Benign |
| 3 | Benign | 32 | 13 | 13.500000 | 0 | 264.281250 | 1445 | 45.156250 | 8457 | 264.281250 | 0 | 630 | 2961 | 36 | 654 | 792 | 83 | 567 | 127 | 186 | 242 | 31 | 62 | 31 | 0.021483 | 0.042966 | 0.021483 | 2 | 2 | 12 | 1.000000 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.125000 | 0.062500 | 0.187500 | 138 | 395 | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | Benign |
| 4 | Benign | 42 | 16 | 11.452381 | 0 | 281.333333 | 2067 | 49.214286 | 11816 | 281.333333 | 0 | 908 | 3834 | 45 | 1252 | 942 | 103 | 825 | 135 | 375 | 429 | 102 | 143 | 102 | 0.047820 | 0.067042 | 0.047820 | 12 | 77 | 72 | 2.000000 | 4 | 0 | 4 | 4 | 8 | 6 | 10 | 0.086957 | 0.0 | 0.086957 | 0.086957 | 0.173913 | 0.130435 | 0.217391 | 138 | 392 | 222 | 26 | 24 | 118 | 0 | 124 | 87 | 0 | 8 | Benign |
In this section, I will assess the quality and tidiness of the data. I will also clean the data.
# Count the missing (NaN) entries in each column
df.isna().sum()
Category 0 pslist.nproc 0 pslist.nppid 0 pslist.avg_threads 0 pslist.nprocs64bit 0 pslist.avg_handlers 0 dlllist.ndlls 0 dlllist.avg_dlls_per_proc 0 handles.nhandles 0 handles.avg_handles_per_proc 0 handles.nport 0 handles.nfile 0 handles.nevent 0 handles.ndesktop 0 handles.nkey 0 handles.nthread 0 handles.ndirectory 0 handles.nsemaphore 0 handles.ntimer 0 handles.nsection 0 handles.nmutant 0 ldrmodules.not_in_load 0 ldrmodules.not_in_init 0 ldrmodules.not_in_mem 0 ldrmodules.not_in_load_avg 0 ldrmodules.not_in_init_avg 0 ldrmodules.not_in_mem_avg 0 malfind.ninjections 0 malfind.commitCharge 0 malfind.protection 0 malfind.uniqueInjections 0 psxview.not_in_pslist 0 psxview.not_in_eprocess_pool 0 psxview.not_in_ethread_pool 0 psxview.not_in_pspcid_list 0 psxview.not_in_csrss_handles 0 psxview.not_in_session 0 psxview.not_in_deskthrd 0 psxview.not_in_pslist_false_avg 0 psxview.not_in_eprocess_pool_false_avg 0 psxview.not_in_ethread_pool_false_avg 0 psxview.not_in_pspcid_list_false_avg 0 psxview.not_in_csrss_handles_false_avg 0 psxview.not_in_session_false_avg 0 psxview.not_in_deskthrd_false_avg 0 modules.nmodules 0 svcscan.nservices 0 svcscan.kernel_drivers 0 svcscan.fs_drivers 0 svcscan.process_services 0 svcscan.shared_process_services 0 svcscan.interactive_process_services 0 svcscan.nactive 0 callbacks.ncallbacks 0 callbacks.nanonymous 0 callbacks.ngeneric 0 Class 0 dtype: int64
There are no missing values in the data.
# Print each column's dtype and non-null count, plus the frame's memory usage
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 58596 entries, 0 to 58595 Data columns (total 57 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Category 58596 non-null object 1 pslist.nproc 58596 non-null int64 2 pslist.nppid 58596 non-null int64 3 pslist.avg_threads 58596 non-null float64 4 pslist.nprocs64bit 58596 non-null int64 5 pslist.avg_handlers 58596 non-null float64 6 dlllist.ndlls 58596 non-null int64 7 dlllist.avg_dlls_per_proc 58596 non-null float64 8 handles.nhandles 58596 non-null int64 9 handles.avg_handles_per_proc 58596 non-null float64 10 handles.nport 58596 non-null int64 11 handles.nfile 58596 non-null int64 12 handles.nevent 58596 non-null int64 13 handles.ndesktop 58596 non-null int64 14 handles.nkey 58596 non-null int64 15 handles.nthread 58596 non-null int64 16 handles.ndirectory 58596 non-null int64 17 handles.nsemaphore 58596 non-null int64 18 handles.ntimer 58596 non-null int64 19 handles.nsection 58596 non-null int64 20 handles.nmutant 58596 non-null int64 21 ldrmodules.not_in_load 58596 non-null int64 22 ldrmodules.not_in_init 58596 non-null int64 23 ldrmodules.not_in_mem 58596 non-null int64 24 ldrmodules.not_in_load_avg 58596 non-null float64 25 ldrmodules.not_in_init_avg 58596 non-null float64 26 ldrmodules.not_in_mem_avg 58596 non-null float64 27 malfind.ninjections 58596 non-null int64 28 malfind.commitCharge 58596 non-null int64 29 malfind.protection 58596 non-null int64 30 malfind.uniqueInjections 58596 non-null float64 31 psxview.not_in_pslist 58596 non-null int64 32 psxview.not_in_eprocess_pool 58596 non-null int64 33 psxview.not_in_ethread_pool 58596 non-null int64 34 psxview.not_in_pspcid_list 58596 non-null int64 35 psxview.not_in_csrss_handles 58596 non-null int64 36 psxview.not_in_session 58596 non-null int64 37 psxview.not_in_deskthrd 58596 non-null int64 38 psxview.not_in_pslist_false_avg 58596 non-null float64 39 psxview.not_in_eprocess_pool_false_avg 58596 non-null float64 40 
psxview.not_in_ethread_pool_false_avg 58596 non-null float64 41 psxview.not_in_pspcid_list_false_avg 58596 non-null float64 42 psxview.not_in_csrss_handles_false_avg 58596 non-null float64 43 psxview.not_in_session_false_avg 58596 non-null float64 44 psxview.not_in_deskthrd_false_avg 58596 non-null float64 45 modules.nmodules 58596 non-null int64 46 svcscan.nservices 58596 non-null int64 47 svcscan.kernel_drivers 58596 non-null int64 48 svcscan.fs_drivers 58596 non-null int64 49 svcscan.process_services 58596 non-null int64 50 svcscan.shared_process_services 58596 non-null int64 51 svcscan.interactive_process_services 58596 non-null int64 52 svcscan.nactive 58596 non-null int64 53 callbacks.ncallbacks 58596 non-null int64 54 callbacks.nanonymous 58596 non-null int64 55 callbacks.ngeneric 58596 non-null int64 56 Class 58596 non-null object dtypes: float64(15), int64(40), object(2) memory usage: 25.5+ MB
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()
534
# Preview ten randomly chosen rows of the dataset
df.sample(n=10)
| Category | pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.nprocs64bit | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | handles.nport | handles.nfile | handles.nevent | handles.ndesktop | handles.nkey | handles.nthread | handles.ndirectory | handles.nsemaphore | handles.ntimer | handles.nsection | handles.nmutant | ldrmodules.not_in_load | ldrmodules.not_in_init | ldrmodules.not_in_mem | ldrmodules.not_in_load_avg | ldrmodules.not_in_init_avg | ldrmodules.not_in_mem_avg | malfind.ninjections | malfind.commitCharge | malfind.protection | malfind.uniqueInjections | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_pspcid_list | psxview.not_in_csrss_handles | psxview.not_in_session | psxview.not_in_deskthrd | psxview.not_in_pslist_false_avg | psxview.not_in_eprocess_pool_false_avg | psxview.not_in_ethread_pool_false_avg | psxview.not_in_pspcid_list_false_avg | psxview.not_in_csrss_handles_false_avg | psxview.not_in_session_false_avg | psxview.not_in_deskthrd_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.interactive_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3537 | Benign | 41 | 17 | 11.198938 | 0 | 264.622031 | 1859 | 45.341463 | 10850 | 271.262582 | 0 | 859 | 3702 | 45 | 941 | 898 | 102 | 740 | 131 | 339 | 367 | 80 | 118 | 80 | 0.042395 | 0.062533 | 0.042395 | 5 | 7 | 30 | 1.666667 | 14 | 0 | 15 | 14 | 19 | 16 | 21 | 0.254545 | 0.0 | 0.272727 | 0.254545 | 0.345455 | 0.290909 | 0.381818 | 138 | 392 | 222 | 26 | 24 | 118 | 0 | 126 | 87 | 0 | 8 | Benign |
| 44764 | Spyware-Transponder-0f4575a4973db1ad7190888424... | 43 | 17 | 9.674419 | 0 | 208.255814 | 1660 | 38.604651 | 8956 | 213.238095 | 0 | 866 | 3002 | 45 | 689 | 756 | 106 | 662 | 129 | 191 | 284 | 47 | 88 | 47 | 0.027680 | 0.051826 | 0.027680 | 61 | 18559 | 366 | 12.200000 | 2 | 0 | 3 | 2 | 7 | 4 | 9 | 0.044444 | 0.0 | 0.066667 | 0.044444 | 0.155556 | 0.088889 | 0.200000 | 138 | 389 | 221 | 26 | 24 | 116 | 0 | 123 | 87 | 0 | 8 | Malware |
| 31961 | Ransomware-Maze-1c3ffbe26074e9fd9edb0621b3b124... | 43 | 17 | 9.348837 | 0 | 199.279070 | 1634 | 38.000000 | 8569 | 199.279070 | 0 | 662 | 2986 | 46 | 691 | 717 | 107 | 615 | 126 | 178 | 265 | 46 | 88 | 46 | 0.027512 | 0.052632 | 0.027512 | 4 | 4 | 24 | 1.000000 | 2 | 0 | 2 | 2 | 6 | 4 | 8 | 0.044444 | 0.0 | 0.044444 | 0.044444 | 0.133333 | 0.088889 | 0.177778 | 138 | 389 | 221 | 26 | 24 | 116 | 0 | 122 | 87 | 0 | 8 | Malware |
| 57735 | Ransomware-Shade-59a3d9fdeca28c76d2baa3c0de2de... | 43 | 17 | 9.372093 | 0 | 198.604651 | 1584 | 36.837209 | 8071 | 237.382353 | 0 | 625 | 2844 | 34 | 632 | 711 | 91 | 609 | 115 | 180 | 252 | 41 | 78 | 41 | 0.037477 | 0.071298 | 0.037477 | 2 | 2 | 12 | 1.000000 | 2 | 0 | 2 | 2 | 6 | 4 | 12 | 0.044444 | 0.0 | 0.044444 | 0.044444 | 0.133333 | 0.088889 | 0.266667 | 138 | 389 | 221 | 26 | 24 | 116 | 0 | 120 | 55 | 0 | 8 | Malware |
| 23099 | Benign | 40 | 12 | 13.580798 | 0 | 306.253991 | 2090 | 52.257605 | 12250 | 306.253991 | 0 | 1066 | 4299 | 44 | 897 | 1180 | 101 | 757 | 140 | 400 | 398 | 80 | 117 | 80 | 0.038254 | 0.055946 | 0.038254 | 5 | 6 | 30 | 1.250000 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.100000 | 0.050000 | 0.150000 | 138 | 395 | 222 | 26 | 27 | 118 | 0 | 123 | 88 | 0 | 8 | Benign |
| 46097 | Trojan-Emotet-0f274487265796754195c2fc1510e1c0... | 37 | 15 | 10.081081 | 0 | 214.162162 | 1466 | 39.621622 | 7924 | 214.162162 | 0 | 613 | 2797 | 40 | 661 | 644 | 91 | 630 | 113 | 153 | 223 | 45 | 81 | 45 | 0.029801 | 0.053642 | 0.029801 | 3 | 3 | 18 | 1.000000 | 0 | 0 | 0 | 0 | 4 | 2 | 5 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.108108 | 0.054054 | 0.135135 | 138 | 389 | 221 | 26 | 24 | 116 | 0 | 118 | 88 | 0 | 8 | Malware |
| 10675 | Benign | 42 | 12 | 13.116246 | 0 | 298.211082 | 2163 | 51.500000 | 12525 | 305.508913 | 0 | 1077 | 4380 | 45 | 922 | 1203 | 104 | 778 | 144 | 422 | 396 | 81 | 118 | 81 | 0.037939 | 0.055269 | 0.037939 | 5 | 6 | 30 | 1.250000 | 0 | 0 | 1 | 0 | 5 | 2 | 7 | 0.000000 | 0.0 | 0.023810 | 0.000000 | 0.119048 | 0.047619 | 0.166667 | 138 | 395 | 222 | 26 | 27 | 118 | 0 | 125 | 88 | 0 | 8 | Benign |
| 56326 | Ransomware-Ako-0c19a499353c91eac2ffa6302c586f4... | 37 | 15 | 10.054054 | 0 | 214.810811 | 1444 | 39.027027 | 7948 | 214.810811 | 0 | 621 | 2812 | 40 | 665 | 658 | 92 | 596 | 113 | 160 | 230 | 43 | 79 | 43 | 0.029392 | 0.053999 | 0.029392 | 3 | 3 | 18 | 1.000000 | 4 | 0 | 4 | 4 | 8 | 6 | 10 | 0.097561 | 0.0 | 0.097561 | 0.097561 | 0.195122 | 0.146341 | 0.243902 | 138 | 389 | 221 | 26 | 24 | 116 | 0 | 118 | 86 | 0 | 8 | Malware |
| 55366 | Ransomware-Ako-0b0d0e4701b0c13c3ea3f1da137940d... | 40 | 16 | 9.850000 | 0 | 213.575000 | 1548 | 38.700000 | 8544 | 219.076923 | 0 | 848 | 2880 | 42 | 692 | 705 | 97 | 652 | 119 | 171 | 252 | 44 | 82 | 44 | 0.028133 | 0.052430 | 0.028133 | 61 | 18559 | 366 | 12.200000 | 3 | 0 | 4 | 3 | 8 | 5 | 10 | 0.069767 | 0.0 | 0.093023 | 0.069767 | 0.186047 | 0.116279 | 0.232558 | 138 | 389 | 221 | 26 | 24 | 116 | 0 | 119 | 86 | 0 | 8 | Malware |
| 29582 | Ransomware-Ako-0a942b7066fbae247a767612dd723c4... | 40 | 16 | 9.875000 | 0 | 208.775000 | 1557 | 38.925000 | 8351 | 208.775000 | 0 | 642 | 2925 | 43 | 665 | 708 | 101 | 606 | 123 | 178 | 262 | 45 | 84 | 45 | 0.028143 | 0.052533 | 0.028143 | 3 | 3 | 18 | 1.000000 | 2 | 0 | 2 | 2 | 6 | 4 | 8 | 0.047619 | 0.0 | 0.047619 | 0.047619 | 0.142857 | 0.095238 | 0.190476 | 138 | 389 | 221 | 26 | 24 | 116 | 0 | 122 | 86 | 0 | 8 | Malware |
# List the distinct values of the Category column
df['Category'].unique()
array(['Benign',
'Ransomware-Ako-00a2c6bab1e53f679cdd4fdc772cd291928c109b9b747652639a1700d844f719-1.raw',
'Ransomware-Ako-00a2c6bab1e53f679cdd4fdc772cd291928c109b9b747652639a1700d844f719-10.raw',
...,
'Ransomware-Shade-faddeea111a25da4d0888f3044ae9555f0c55517f6226b30e521008fceda6bbf-7.raw',
'Ransomware-Shade-f866c086af2e1d8ebaa6f2c8631578896768285120b57ddd43453bdebb217ab1-10.raw',
'Ransomware-Shade-955d9af38346c1755527bd196668edfad6d3f001d217b04d2380eb99e0760585-8.raw'],
dtype=object)
# List the distinct values of the Class column
df['Class'].unique()
array(['Benign', 'Malware'], dtype=object)
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()
| pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.nprocs64bit | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | handles.nport | handles.nfile | handles.nevent | handles.ndesktop | handles.nkey | handles.nthread | handles.ndirectory | handles.nsemaphore | handles.ntimer | handles.nsection | handles.nmutant | ldrmodules.not_in_load | ldrmodules.not_in_init | ldrmodules.not_in_mem | ldrmodules.not_in_load_avg | ldrmodules.not_in_init_avg | ldrmodules.not_in_mem_avg | malfind.ninjections | malfind.commitCharge | malfind.protection | malfind.uniqueInjections | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_pspcid_list | psxview.not_in_csrss_handles | psxview.not_in_session | psxview.not_in_deskthrd | psxview.not_in_pslist_false_avg | psxview.not_in_eprocess_pool_false_avg | psxview.not_in_ethread_pool_false_avg | psxview.not_in_pspcid_list_false_avg | psxview.not_in_csrss_handles_false_avg | psxview.not_in_session_false_avg | psxview.not_in_deskthrd_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.interactive_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 58596.000000 | 58596.000000 | 58596.000000 | 58596.0 | 58596.000000 | 58596.000000 | 58596.000000 | 5.859600e+04 | 58596.000000 | 58596.0 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.0 | 58596.000000 | 58596.000000 | 58596.000000 | 58596.000000 |
| mean | 41.394771 | 14.713837 | 11.341655 | 0.0 | 247.509819 | 1810.805447 | 43.707806 | 1.025858e+04 | 249.560958 | 0.0 | 899.119513 | 3572.409960 | 44.529166 | 774.280668 | 928.510086 | 102.398338 | 683.339324 | 130.327855 | 290.127466 | 312.588829 | 60.830347 | 99.946413 | 60.832599 | 0.033170 | 0.055223 | 0.033171 | 7.010274 | 969.199229 | 42.282408 | 1.733699 | 1.875845 | 0.002082 | 2.273500 | 1.879463 | 6.276572 | 3.875589 | 8.256605 | 0.040991 | 0.000080 | 0.047745 | 0.041078 | 0.141784 | 0.087964 | 0.187701 | 137.961465 | 391.347549 | 221.406581 | 25.996245 | 25.063417 | 116.879514 | 0.0 | 121.995546 | 86.905659 | 0.000853 | 7.999881 |
| std | 5.777249 | 2.656748 | 1.588231 | 0.0 | 111.857790 | 329.782639 | 5.742023 | 4.866864e+03 | 145.999866 | 0.0 | 3432.351200 | 805.460522 | 5.161254 | 150.407075 | 237.817566 | 9.782695 | 94.531078 | 14.965266 | 144.278832 | 73.173189 | 18.761422 | 21.438482 | 18.759947 | 0.009263 | 0.010112 | 0.009266 | 15.390647 | 6041.620916 | 92.337061 | 2.741343 | 2.995955 | 0.045582 | 4.621418 | 3.017659 | 4.622047 | 2.995970 | 4.736886 | 0.057563 | 0.001269 | 0.066907 | 0.058216 | 0.061438 | 0.055040 | 0.061534 | 0.198251 | 4.529704 | 1.991087 | 0.170790 | 1.529628 | 1.550401 | 0.0 | 2.822858 | 3.134117 | 0.029199 | 0.010929 |
| min | 21.000000 | 8.000000 | 1.650000 | 0.0 | 34.962500 | 670.000000 | 7.333333 | 3.514000e+03 | 71.139241 | 0.0 | 266.000000 | 966.000000 | 22.000000 | 284.000000 | 388.000000 | 57.000000 | 296.000000 | 69.000000 | 50.000000 | 118.000000 | 6.000000 | 16.000000 | 6.000000 | 0.016176 | 0.040526 | 0.016176 | 1.000000 | 1.000000 | 6.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2.000000 | 4.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.025806 | 0.008333 | 0.043750 | 126.000000 | 94.000000 | 55.000000 | 6.000000 | 7.000000 | 26.000000 | 0.0 | 30.000000 | 50.000000 | 0.000000 | 7.000000 |
| 25% | 40.000000 | 12.000000 | 9.972973 | 0.0 | 208.725000 | 1556.000000 | 38.833333 | 8.393000e+03 | 209.648228 | 0.0 | 646.000000 | 2923.000000 | 43.000000 | 675.000000 | 708.000000 | 99.000000 | 614.000000 | 120.000000 | 177.000000 | 258.000000 | 46.000000 | 85.000000 | 46.000000 | 0.028846 | 0.052397 | 0.028846 | 3.000000 | 3.000000 | 18.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2.000000 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.100000 | 0.048780 | 0.146341 | 138.000000 | 389.000000 | 221.000000 | 26.000000 | 24.000000 | 116.000000 | 0.0 | 121.000000 | 87.000000 | 0.000000 | 8.000000 |
| 50% | 41.000000 | 15.000000 | 11.000000 | 0.0 | 243.963710 | 1735.000000 | 42.781524 | 9.287500e+03 | 247.208951 | 0.0 | 839.000000 | 3151.000000 | 45.000000 | 753.000000 | 848.000000 | 103.000000 | 684.000000 | 131.000000 | 224.000000 | 289.000000 | 57.000000 | 97.000000 | 57.000000 | 0.031361 | 0.054036 | 0.031361 | 4.000000 | 4.000000 | 24.000000 | 1.250000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 5.000000 | 3.000000 | 7.000000 | 0.021739 | 0.000000 | 0.023256 | 0.021739 | 0.119048 | 0.066667 | 0.166465 | 138.000000 | 389.000000 | 221.000000 | 26.000000 | 24.000000 | 116.000000 | 0.0 | 122.000000 | 87.000000 | 0.000000 | 8.000000 |
| 75% | 43.000000 | 16.000000 | 12.861955 | 0.0 | 289.974322 | 2087.000000 | 49.605280 | 1.219300e+04 | 291.355050 | 0.0 | 1080.000000 | 4321.000000 | 46.000000 | 859.000000 | 1169.000000 | 107.000000 | 750.000000 | 142.000000 | 415.000000 | 366.000000 | 74.000000 | 115.000000 | 74.000000 | 0.036430 | 0.056010 | 0.036430 | 5.000000 | 6.000000 | 30.000000 | 1.333333 | 3.000000 | 0.000000 | 3.000000 | 3.000000 | 7.000000 | 5.000000 | 9.000000 | 0.068182 | 0.000000 | 0.069767 | 0.068182 | 0.166180 | 0.113636 | 0.211691 | 138.000000 | 395.000000 | 222.000000 | 26.000000 | 27.000000 | 118.000000 | 0.0 | 123.000000 | 88.000000 | 0.000000 | 8.000000 |
| max | 240.000000 | 72.000000 | 16.818182 | 0.0 | 24845.951220 | 3443.000000 | 53.170732 | 1.047310e+06 | 33784.193550 | 0.0 | 807008.000000 | 7892.000000 | 159.000000 | 2668.000000 | 5637.000000 | 498.000000 | 4268.000000 | 382.000000 | 14687.000000 | 583.000000 | 240.000000 | 264.000000 | 240.000000 | 0.531120 | 0.585062 | 0.531120 | 627.000000 | 220850.000000 | 3762.000000 | 90.666667 | 43.000000 | 1.000000 | 201.000000 | 43.000000 | 205.000000 | 45.000000 | 207.000000 | 0.551282 | 0.043478 | 0.837500 | 1.000000 | 0.854167 | 0.576923 | 0.862500 | 138.000000 | 395.000000 | 222.000000 | 26.000000 | 27.000000 | 118.000000 | 0.0 | 129.000000 | 89.000000 | 1.000000 | 8.000000 |
# List the distinct values of pslist.nprocs64bit
# (describe() reports min == max == 0 for this column)
df['pslist.nprocs64bit'].unique()
array([0], dtype=int64)
# List the distinct values of svcscan.interactive_process_services
# (describe() reports min == max == 0 for this column)
df['svcscan.interactive_process_services'].unique()
array([0], dtype=int64)
# List the distinct values of handles.nport
# (describe() reports min == max == 0 for this column)
df['handles.nport'].unique()
array([0], dtype=int64)
# List the distinct values of svcscan.interactive_process_services
# NOTE(review): this repeats the check already made on the same column earlier
# in this section; it looks redundant and can probably be removed.
df['svcscan.interactive_process_services'].unique()
array([0], dtype=int64)
Issues found:

- The `Class` and `Category` variables are of the object data type.
- The `Category` variable has strange characters such as `-00a2c6bab1e53f679cdd4fdc772cd291928c109b9b747652639a1700d844f719-1.raw`.
- The `Category` variable combines the malware category and the malware family in a single value, e.g. `Spyware-Gator` and `Ransomware-Ako`.
- The `pslist.nprocs64bit`, `svcscan.interactive_process_services` and `handles.nport` variables are all zeros.
- The following variables have maximum values far above their 75th percentile (possible outliers): `pslist.nppid`, `pslist.avg_handlers`, `handles.avg_handles_per_proc`, `handles.nfile`, `handles.ndesktop`, `handles.nkey`, `handles.nthread`, `handles.nsemaphore`, `handles.nsection`, `malfind.ninjections`, `malfind.commitCharge`, `malfind.protection`, `malfind.uniqueInjections`, `psxview.not_in_pslist`, `psxview.not_in_ethread_pool`, `psxview.not_in_pspcid_list`, `psxview.not_in_csrss_handles`, `psxview.not_in_session`, `psxview.not_in_deskthrd`.

Cleaning steps:

- Remove the strange characters from the `Category` variable.
- Split the `Category` column. Create a new column called `type`.
- Convert `Class` and `Category` to the categorical data type.
- Remove the `pslist.nprocs64bit`, `svcscan.interactive_process_services` and `handles.nport` variables.

Remove duplicate records
# Report the frame's shape, discard rows that duplicate an earlier row,
# then report the shape again to show how many rows were removed
print("Shape of dataset before dropping duplicates", df.shape)
df = df.drop_duplicates()
print("Shape of dataset after dropping duplicates", df.shape)
Shape of dataset before dropping duplicates (58596, 57) Shape of dataset after dropping duplicates (58062, 57)
Remove strange characters from the Category column
# Shape of the array of distinct Category values, i.e. how many there are
df['Category'].unique().shape
(28346,)
There are 28346 unique values in the Category column. Let's fix this.
# Look at ten randomly chosen raw Category labels
df['Category'].sample(n=10)
52976 Trojan-Zeus-01ded237eec204285a7463cf4e311e1db1... 50281 Trojan-Refroso-a2f900571e93136c0dc4c63222cbca5... 49204 Trojan-Refroso-256ff792cb3d621aa5947ed3799b52a... 4631 Benign 23955 Benign 26743 Benign 3693 Benign 39551 Spyware-Gator-0a86de744235ed74b9dc46b74c70103d... 2793 Benign 890 Benign Name: Category, dtype: object
# Keep only the leading field of each label (e.g. 'Ransomware' from
# 'Ransomware-Ako-<hash>-1.raw'; 'Benign' has no hyphen and is kept as is).
# The vectorised .str accessor replaces the per-row lambda, and n=1 stops
# splitting after the first hyphen since only the first field is used.
df['category_new'] = df['Category'].str.split('-', n=1).str[0]
df['category_new'].sample(10)
30307 Ransomware 25170 Benign 39731 Spyware 43367 Spyware 36005 Spyware 23840 Benign 15238 Benign 12162 Benign 47934 Trojan 37760 Spyware Name: category_new, dtype: object
Create a new column for type of attack
def attack_type(value):
    """Return the malware family encoded in a raw ``Category`` label.

    Malware labels are hyphen-separated, e.g.
    ``'Ransomware-Ako-<hash>-1.raw'``; the family is the second field.

    Args:
        value: Raw label string from the ``Category`` column.

    Returns:
        The string ``'None'`` (not the ``None`` object) for ``'Benign'``,
        the second hyphen-separated field for any other label, or
        ``'Unknown'`` when the label contains no hyphen.
    """
    if value == 'Benign':
        return 'None'
    # Only the first two fields are needed, so stop splitting after them.
    fields = value.split('-', 2)
    # A label without a hyphen has no family field. Indexing [1] directly
    # raised IndexError here, which aborted the whole Series.apply call.
    if len(fields) < 2:
        return 'Unknown'
    return fields[1]
# Derive the attack type of every row from its raw Category label,
# then look at ten random values of the new column
df['type'] = df['Category'].map(attack_type)
df['type'].sample(n=10)
20063 None 36293 180solutions 56699 Ako 40706 Gator 53087 Zeus 26444 None 38919 CWS 42483 TIBS 37213 180solutions 7452 None Name: type, dtype: object
# Print the distinct values of the two derived columns (attack type first,
# then the simplified category), and preview the frame with them added
for column_name in ('type', 'category_new'):
    print(df[column_name].unique())
df.head()
['None' 'Ako' 'Conti' 'Maze' 'Pysa' 'Shade' '180solutions' 'CWS' 'Gator' 'TIBS' 'Transponder' 'Emotet' 'Reconyc' 'Refroso' 'Scar' 'Zeus'] ['Benign' 'Ransomware' 'Spyware' 'Trojan']
| Category | pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.nprocs64bit | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | handles.nport | handles.nfile | handles.nevent | handles.ndesktop | handles.nkey | handles.nthread | handles.ndirectory | handles.nsemaphore | handles.ntimer | handles.nsection | handles.nmutant | ldrmodules.not_in_load | ldrmodules.not_in_init | ldrmodules.not_in_mem | ldrmodules.not_in_load_avg | ldrmodules.not_in_init_avg | ldrmodules.not_in_mem_avg | malfind.ninjections | ... | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_pspcid_list | psxview.not_in_csrss_handles | psxview.not_in_session | psxview.not_in_deskthrd | psxview.not_in_pslist_false_avg | psxview.not_in_eprocess_pool_false_avg | psxview.not_in_ethread_pool_false_avg | psxview.not_in_pspcid_list_false_avg | psxview.not_in_csrss_handles_false_avg | psxview.not_in_session_false_avg | psxview.not_in_deskthrd_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.interactive_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | Class | category_new | type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Benign | 45 | 17 | 10.555556 | 0 | 202.844444 | 1694 | 38.500000 | 9129 | 212.302326 | 0 | 670 | 3161 | 46 | 716 | 887 | 104 | 671 | 125 | 184 | 257 | 53 | 95 | 53 | 0.030372 | 0.054441 | 0.030372 | 5 | ... | 2 | 0 | 3 | 2 | 7 | 4 | 9 | 0.042553 | 0.0 | 0.063830 | 0.042553 | 0.148936 | 0.085106 | 0.191489 | 138 | 389 | 221 | 26 | 24 | 116 | 0 | 121 | 87 | 0 | 8 | Benign | Benign | None |
| 1 | Benign | 47 | 19 | 11.531915 | 0 | 242.234043 | 2074 | 44.127660 | 11385 | 242.234043 | 0 | 840 | 3761 | 51 | 1011 | 1030 | 117 | 766 | 148 | 337 | 394 | 77 | 123 | 77 | 0.036167 | 0.057774 | 0.036167 | 12 | ... | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.085106 | 0.042553 | 0.127660 | 138 | 392 | 222 | 26 | 24 | 118 | 0 | 122 | 87 | 0 | 8 | Benign | Benign | None |
| 2 | Benign | 40 | 14 | 14.725000 | 0 | 288.225000 | 1932 | 48.300000 | 11529 | 288.225000 | 0 | 1050 | 3996 | 45 | 784 | 1241 | 100 | 645 | 138 | 369 | 338 | 51 | 89 | 51 | 0.026114 | 0.045571 | 0.026114 | 5 | ... | 0 | 0 | 0 | 0 | 4 | 2 | 5 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.100000 | 0.050000 | 0.125000 | 137 | 395 | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | Benign | Benign | None |
| 3 | Benign | 32 | 13 | 13.500000 | 0 | 264.281250 | 1445 | 45.156250 | 8457 | 264.281250 | 0 | 630 | 2961 | 36 | 654 | 792 | 83 | 567 | 127 | 186 | 242 | 31 | 62 | 31 | 0.021483 | 0.042966 | 0.021483 | 2 | ... | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.125000 | 0.062500 | 0.187500 | 138 | 395 | 222 | 26 | 27 | 118 | 0 | 120 | 88 | 0 | 8 | Benign | Benign | None |
| 4 | Benign | 42 | 16 | 11.452381 | 0 | 281.333333 | 2067 | 49.214286 | 11816 | 281.333333 | 0 | 908 | 3834 | 45 | 1252 | 942 | 103 | 825 | 135 | 375 | 429 | 102 | 143 | 102 | 0.047820 | 0.067042 | 0.047820 | 12 | ... | 4 | 0 | 4 | 4 | 8 | 6 | 10 | 0.086957 | 0.0 | 0.086957 | 0.086957 | 0.173913 | 0.130435 | 0.217391 | 138 | 392 | 222 | 26 | 24 | 118 | 0 | 124 | 87 | 0 | 8 | Benign | Benign | None |
5 rows × 59 columns
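Convert the Class, category_new and type columns to the categorical data type. Note: the conversion code below is currently commented out, so these columns remain of the object data type.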
# df.Class = df.Class.astype('category')
# df.category_new = df.category_new.astype('category')
# df.type = df.type.astype('category')
# # test
# print(df.Class.dtype)
# print(df.category_new.dtype)
# print(df.type.dtype)
Remove the pslist.nprocs64bit, svcscan.interactive_process_services and handles.nport variables (they contain only zeros), together with the original Category column.
# Drop the three columns whose only value is 0, plus the raw Category label
# (category_new and type were derived from it above). Each label is listed
# once; the earlier list named svcscan.interactive_process_services twice.
df.drop(columns=['pslist.nprocs64bit', 'svcscan.interactive_process_services',
                 'handles.nport', 'Category'],
        inplace=True)
# Sanity check: each line prints whether a dropped column is still present
for dropped_name in ('Category', 'handles.nport'):
    print(dropped_name in df.columns)
False False
# Preview the first rows of the frame after the columns were dropped
df.head()
| pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | handles.nfile | handles.nevent | handles.ndesktop | handles.nkey | handles.nthread | handles.ndirectory | handles.nsemaphore | handles.ntimer | handles.nsection | handles.nmutant | ldrmodules.not_in_load | ldrmodules.not_in_init | ldrmodules.not_in_mem | ldrmodules.not_in_load_avg | ldrmodules.not_in_init_avg | ldrmodules.not_in_mem_avg | malfind.ninjections | malfind.commitCharge | malfind.protection | malfind.uniqueInjections | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_pspcid_list | psxview.not_in_csrss_handles | psxview.not_in_session | psxview.not_in_deskthrd | psxview.not_in_pslist_false_avg | psxview.not_in_eprocess_pool_false_avg | psxview.not_in_ethread_pool_false_avg | psxview.not_in_pspcid_list_false_avg | psxview.not_in_csrss_handles_false_avg | psxview.not_in_session_false_avg | psxview.not_in_deskthrd_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | Class | category_new | type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45 | 17 | 10.555556 | 202.844444 | 1694 | 38.500000 | 9129 | 212.302326 | 670 | 3161 | 46 | 716 | 887 | 104 | 671 | 125 | 184 | 257 | 53 | 95 | 53 | 0.030372 | 0.054441 | 0.030372 | 5 | 21 | 30 | 1.250000 | 2 | 0 | 3 | 2 | 7 | 4 | 9 | 0.042553 | 0.0 | 0.063830 | 0.042553 | 0.148936 | 0.085106 | 0.191489 | 138 | 389 | 221 | 26 | 24 | 116 | 121 | 87 | 0 | 8 | Benign | Benign | None |
| 1 | 47 | 19 | 11.531915 | 242.234043 | 2074 | 44.127660 | 11385 | 242.234043 | 840 | 3761 | 51 | 1011 | 1030 | 117 | 766 | 148 | 337 | 394 | 77 | 123 | 77 | 0.036167 | 0.057774 | 0.036167 | 12 | 77 | 72 | 1.714286 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.085106 | 0.042553 | 0.127660 | 138 | 392 | 222 | 26 | 24 | 118 | 122 | 87 | 0 | 8 | Benign | Benign | None |
| 2 | 40 | 14 | 14.725000 | 288.225000 | 1932 | 48.300000 | 11529 | 288.225000 | 1050 | 3996 | 45 | 784 | 1241 | 100 | 645 | 138 | 369 | 338 | 51 | 89 | 51 | 0.026114 | 0.045571 | 0.026114 | 5 | 6 | 30 | 1.250000 | 0 | 0 | 0 | 0 | 4 | 2 | 5 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.100000 | 0.050000 | 0.125000 | 137 | 395 | 222 | 26 | 27 | 118 | 120 | 88 | 0 | 8 | Benign | Benign | None |
| 3 | 32 | 13 | 13.500000 | 264.281250 | 1445 | 45.156250 | 8457 | 264.281250 | 630 | 2961 | 36 | 654 | 792 | 83 | 567 | 127 | 186 | 242 | 31 | 62 | 31 | 0.021483 | 0.042966 | 0.021483 | 2 | 2 | 12 | 1.000000 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.125000 | 0.062500 | 0.187500 | 138 | 395 | 222 | 26 | 27 | 118 | 120 | 88 | 0 | 8 | Benign | Benign | None |
| 4 | 42 | 16 | 11.452381 | 281.333333 | 2067 | 49.214286 | 11816 | 281.333333 | 908 | 3834 | 45 | 1252 | 942 | 103 | 825 | 135 | 375 | 429 | 102 | 143 | 102 | 0.047820 | 0.067042 | 0.047820 | 12 | 77 | 72 | 2.000000 | 4 | 0 | 4 | 4 | 8 | 6 | 10 | 0.086957 | 0.0 | 0.086957 | 0.086957 | 0.173913 | 0.130435 | 0.217391 | 138 | 392 | 222 | 26 | 24 | 118 | 124 | 87 | 0 | 8 | Benign | Benign | None |
# Persist the cleaned dataset to a CSV file
cleaned_csv_path = 'Cleaned_Obfuscated-MalMem2022.csv'
df.to_csv(cleaned_csv_path)
Summary Statistics
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()
| pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | handles.nfile | handles.nevent | handles.ndesktop | handles.nkey | handles.nthread | handles.ndirectory | handles.nsemaphore | handles.ntimer | handles.nsection | handles.nmutant | ldrmodules.not_in_load | ldrmodules.not_in_init | ldrmodules.not_in_mem | ldrmodules.not_in_load_avg | ldrmodules.not_in_init_avg | ldrmodules.not_in_mem_avg | malfind.ninjections | malfind.commitCharge | malfind.protection | malfind.uniqueInjections | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_pspcid_list | psxview.not_in_csrss_handles | psxview.not_in_session | psxview.not_in_deskthrd | psxview.not_in_pslist_false_avg | psxview.not_in_eprocess_pool_false_avg | psxview.not_in_ethread_pool_false_avg | psxview.not_in_pspcid_list_false_avg | psxview.not_in_csrss_handles_false_avg | psxview.not_in_session_false_avg | psxview.not_in_deskthrd_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 5.806200e+04 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 | 58062.000000 |
| mean | 41.407547 | 14.709552 | 11.350512 | 247.802426 | 1812.960439 | 43.745115 | 1.027415e+04 | 249.853387 | 901.060625 | 3577.857669 | 44.549309 | 775.187661 | 930.107902 | 102.433140 | 683.968275 | 130.423409 | 291.019221 | 313.088991 | 60.941373 | 100.067996 | 60.943646 | 0.033195 | 0.055230 | 0.033197 | 7.040560 | 977.189160 | 42.466157 | 1.739041 | 1.873807 | 0.002101 | 2.269574 | 1.877459 | 6.272674 | 3.873566 | 8.252730 | 0.040932 | 0.000080 | 0.047660 | 0.041020 | 0.141673 | 0.087892 | 0.187579 | 137.961851 | 391.366935 | 221.410079 | 25.996211 | 25.071389 | 116.887448 | 122.011453 | 86.912955 | 0.000861 | 7.999879 |
| std | 5.777920 | 2.661196 | 1.587684 | 112.285446 | 329.856293 | 5.738297 | 4.884605e+03 | 146.609930 | 3448.012733 | 805.490759 | 5.163729 | 150.528890 | 237.881254 | 9.783108 | 94.434616 | 14.956545 | 144.427593 | 73.200894 | 18.780382 | 21.460897 | 18.778881 | 0.009288 | 0.010143 | 0.009291 | 15.443528 | 6067.036231 | 92.653870 | 2.751470 | 2.999459 | 0.045791 | 4.621438 | 3.021339 | 4.622076 | 2.999466 | 4.737816 | 0.057565 | 0.001275 | 0.066819 | 0.058224 | 0.061351 | 0.055036 | 0.061465 | 0.197402 | 4.538868 | 1.999294 | 0.171573 | 1.531002 | 1.551232 | 2.819533 | 3.115313 | 0.029333 | 0.010979 |
| min | 21.000000 | 8.000000 | 1.650000 | 34.962500 | 670.000000 | 7.333333 | 3.514000e+03 | 71.139241 | 266.000000 | 966.000000 | 22.000000 | 284.000000 | 388.000000 | 57.000000 | 296.000000 | 69.000000 | 50.000000 | 118.000000 | 6.000000 | 16.000000 | 6.000000 | 0.016176 | 0.040526 | 0.016176 | 1.000000 | 1.000000 | 6.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2.000000 | 4.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.025806 | 0.008333 | 0.043750 | 126.000000 | 94.000000 | 55.000000 | 6.000000 | 7.000000 | 26.000000 | 30.000000 | 50.000000 | 0.000000 | 7.000000 |
| 25% | 40.000000 | 12.000000 | 9.973684 | 208.775000 | 1556.000000 | 38.850000 | 8.402000e+03 | 209.745640 | 646.000000 | 2923.000000 | 43.000000 | 675.000000 | 709.000000 | 99.000000 | 614.000000 | 120.000000 | 177.000000 | 259.000000 | 46.000000 | 85.000000 | 46.000000 | 0.028857 | 0.052396 | 0.028858 | 3.000000 | 3.000000 | 18.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4.000000 | 2.000000 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.100000 | 0.048780 | 0.146341 | 138.000000 | 389.000000 | 221.000000 | 26.000000 | 24.000000 | 116.000000 | 121.000000 | 87.000000 | 0.000000 | 8.000000 |
| 50% | 41.000000 | 15.000000 | 11.022727 | 244.783765 | 1739.000000 | 42.977048 | 9.439500e+03 | 248.307436 | 840.000000 | 3176.000000 | 45.000000 | 760.000000 | 852.000000 | 103.000000 | 690.000000 | 131.000000 | 231.000000 | 293.000000 | 57.000000 | 97.000000 | 57.000000 | 0.031410 | 0.054036 | 0.031410 | 4.000000 | 4.000000 | 24.000000 | 1.250000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 5.000000 | 3.000000 | 7.000000 | 0.021573 | 0.000000 | 0.023256 | 0.021616 | 0.119048 | 0.066667 | 0.166177 | 138.000000 | 392.000000 | 222.000000 | 26.000000 | 24.000000 | 118.000000 | 122.000000 | 87.000000 | 0.000000 | 8.000000 |
| 75% | 43.000000 | 16.000000 | 12.867817 | 290.087324 | 2088.000000 | 49.625000 | 1.219600e+04 | 291.438689 | 1081.000000 | 4328.000000 | 46.000000 | 860.000000 | 1170.000000 | 107.000000 | 750.000000 | 142.000000 | 415.000000 | 367.000000 | 75.000000 | 116.000000 | 75.000000 | 0.036430 | 0.056015 | 0.036430 | 5.000000 | 6.000000 | 30.000000 | 1.333333 | 3.000000 | 0.000000 | 3.000000 | 3.000000 | 7.000000 | 5.000000 | 9.000000 | 0.068182 | 0.000000 | 0.069767 | 0.068182 | 0.164873 | 0.113636 | 0.210526 | 138.000000 | 395.000000 | 222.000000 | 26.000000 | 27.000000 | 118.000000 | 123.000000 | 88.000000 | 0.000000 | 8.000000 |
| max | 240.000000 | 72.000000 | 16.818182 | 24845.951220 | 3443.000000 | 53.170732 | 1.047310e+06 | 33784.193550 | 807008.000000 | 7892.000000 | 159.000000 | 2668.000000 | 5637.000000 | 498.000000 | 4268.000000 | 382.000000 | 14687.000000 | 583.000000 | 240.000000 | 264.000000 | 240.000000 | 0.531120 | 0.585062 | 0.531120 | 627.000000 | 220850.000000 | 3762.000000 | 90.666667 | 43.000000 | 1.000000 | 201.000000 | 43.000000 | 205.000000 | 45.000000 | 207.000000 | 0.551282 | 0.043478 | 0.837500 | 1.000000 | 0.854167 | 0.576923 | 0.862500 | 138.000000 | 395.000000 | 222.000000 | 26.000000 | 27.000000 | 118.000000 | 129.000000 | 89.000000 | 1.000000 | 8.000000 |
What is the Proportion of the Class Variable?
# Pie chart of how many records carry each Class label
sorted_counts = df['Class'].value_counts()
plt.figure(figsize=[8, 6])
plt.pie(
    sorted_counts,
    labels=sorted_counts.index,
    autopct='%.1f',
    startangle=90,
    counterclock=False,
)
plt.axis('square')
plt.title('Proportion of Class Label in the Dataset', fontsize=16);
49.7% of the records in the dataset are malware attacks, while 50.3% are benign records. The classes in the dataset are balanced.
What is the proportion of the category_new variable?
# Base color for the visualizations: first color of seaborn's current palette
base_color = sns.color_palette()[0]

# Count of each category_new value; its index order doubles as the bar order
category_count = df['category_new'].value_counts()
order = category_count.index
display(category_count)

plt.figure(figsize=[8, 6])
def plot_bar(df, x, color, order, title, xlabel, text_height, category_count):
    '''Customized function to plot a bar chart with the count written on each bar.

    Parameters
    -----------
    df: Pandas DataFrame. DataFrame with data of interest
    x: str. Variable to be plotted
    color: matplotlib color. Single color used for every bar.
    order: list-like or None. Values of x in the order they should be plotted.
        When None, the index order of category_count is used.
    title: str. Title of the plot.
    xlabel: str. Label for the x-axis.
    text_height: number. Offset added to each count to position its text label.
    category_count: Pandas Series. Series containing the count of each
                    unique value in the variable x, indexed by that value.
    '''
    # Resolve the bar order once so that bars and text labels cannot disagree.
    bar_order = list(category_count.index) if order is None else list(order)

    sns.countplot(data=df, x=x, color=color, order=bar_order)
    plt.xlabel(xlabel, fontsize=14)
    plt.title(title, fontsize=16)

    # Look each count up by label, not by position in category_count, so the
    # text stays on the right bar even when `order` differs from the Series order.
    for position, label in enumerate(bar_order):
        count = category_count.get(label, 0)
        plt.text(position, count + text_height, count, ha='center', va='top')
# Bar chart of the category_new counts, with the count written on each bar
plot_bar(
    df=df,
    x='category_new',
    color=base_color,
    order=order,
    category_count=category_count,
    title='Total Count of Category',
    xlabel='Category',
    text_height=900,
)
Benign 29231 Spyware 9815 Ransomware 9529 Trojan 9487 Name: category_new, dtype: int64
There are 29,231 records of benign connections, 9,815 records of spyware attacks, 9,529 records of ransomware attacks and 9,487 records of trojan attacks.
What are the different types of ransomware attacks in the dataset?
# Keep only the ransomware records and count them by the type column
ransom_df = df[df['category_new'] == 'Ransomware'].copy()
category_count = ransom_df['type'].value_counts()
order = category_count.index
display(category_count)

plt.figure(figsize=[8, 6])
plot_bar(
    df=ransom_df,
    x='type',
    color=base_color,
    order=order,
    category_count=category_count,
    title='Types of Ransomware Attacks',
    xlabel='Ransomware Types',
    text_height=70,
)
Shade 2128 Ako 2000 Conti 1988 Maze 1754 Pysa 1659 Name: type, dtype: int64
There are five types of ransomware attack in the dataset: Shade, Ako, Conti, Maze and Pysa.
There are 2128 shade ransomware attacks, 2000 ako ransomware attacks, 1988 conti ransomware attacks, 1754 maze ransomware attacks and 1659 pysa ransomware attacks.
What are the different types of spyware attacks in the dataset?
# Keep only the spyware records and count them by the type column
spy_df = df[df['category_new'] == 'Spyware'].copy()
category_count = spy_df['type'].value_counts()
order = category_count.index
display(category_count)

plt.figure(figsize=[8, 6])
plot_bar(
    df=spy_df,
    x='type',
    color=base_color,
    order=order,
    category_count=category_count,
    title='Types of Spyware Attacks',
    xlabel='Spyware Types',
    text_height=80,
)
Transponder 2410 180solutions 2000 CWS 2000 Gator 1995 TIBS 1410 Name: type, dtype: int64
There are five types of spyware attack in the dataset: Transponder, 180solutions, CWS, Gator, TIBS.
There are 2410 Transponder spyware attacks, 2000 180solutions spyware attacks, 2000 CWS spyware attacks, 1995 Gator spyware attacks and 1410 TIBS spyware attacks.
What are the different types of trojan attacks in the dataset?
# Keep only the trojan records and count them by the type column
trojan_df = df[df['category_new'] == 'Trojan'].copy()
category_count = trojan_df['type'].value_counts()
order = category_count.index
display(category_count)

plt.figure(figsize=[8, 6])
plot_bar(
    df=trojan_df,
    x='type',
    color=base_color,
    order=order,
    category_count=category_count,
    title='Types of Trojan Attacks',
    xlabel='Trojan Types',
    text_height=70,
)
Refroso 2000 Scar 2000 Emotet 1967 Zeus 1950 Reconyc 1570 Name: type, dtype: int64
There are five types of trojan attack in the dataset: Refroso, Scar, Emotet, Zeus, Reconyc.
There are 2000 Refroso trojan attacks, 2000 Scar trojan attacks, 1967 Emotet trojan attacks, 1950 Zeus trojan attacks and 1570 Reconyc trojan attacks.
Distribution of the 'pslist.nproc' variable
# Histogram of pslist.nproc with bin edges every 5 units, starting at 0
bin_width = 5
nproc = df['pslist.nproc']
binsize = np.arange(0, nproc.max() + bin_width, bin_width)
nproc.hist(bins=binsize);
Distribution of the 'svcscan.shared_process_services' variable
# Histogram of svcscan.shared_process_services with bin edges every 5 units, starting at 0
bin_width = 5
shared_services = df['svcscan.shared_process_services']
binsize = np.arange(0, shared_services.max() + bin_width, bin_width)
shared_services.hist(bins=binsize);
The svcscan.shared_process_services variable is left-skewed.
Distribution of the 'svcscan.kernel_drivers' variable
# Histogram of svcscan.kernel_drivers with bin edges every 5 units, starting at 0
bin_width = 5
kernel_drivers = df['svcscan.kernel_drivers']
binsize = np.arange(0, kernel_drivers.max() + bin_width, bin_width)
kernel_drivers.hist(bins=binsize);
The svcscan.kernel_drivers variable is left-skewed.
Distribution of the 'handles.nmutant' variable
# Histogram of handles.nmutant with bin edges every 50 units, starting at 0
bin_width = 50
nmutant = df['handles.nmutant']
binsize = np.arange(0, nmutant.max() + bin_width, bin_width)
nmutant.hist(bins=binsize);
Most of the values fall within the 250-300 range.
Distribution of the 'handles.nevent' variable
# Histogram of handles.nevent with bin edges every 300 units, starting at 0
bin_width = 300
nevent = df['handles.nevent']
binsize = np.arange(0, nevent.max() + bin_width, bin_width)
nevent.hist(bins=binsize);
Distribution of the 'malfind.ninjections' variable
# Histogram of malfind.ninjections with bin edges every 50 units, starting at 0
bin_width = 50
ninjections = df['malfind.ninjections']
binsize = np.arange(0, ninjections.max() + bin_width, bin_width)
ninjections.hist(bins=binsize);
The malfind.ninjections variable is right-skewed.
In this section, I will take a deep dive into the data to extract some insights from it.
Analysis of the svcscan.kernel_drivers variable
def viobox(df, x, y, suptitle, color=None):
    '''
    Draw a box plot (left) and a violin plot (right) of y grouped by x,
    side by side on a new figure.

    df: DataFrame, Dataset
    x: str, categorical variable on the x-axis
    y: str, numeric variable on the y-axis
    suptitle: str, title of the plot
    color: color of the plot [default is the first color of seaborn's
           current palette]
    '''
    # Fall back to the documented default when the caller gives no color.
    if color is None:
        color = sns.color_palette()[0]

    plt.figure(figsize=[20, 5])
    plt.suptitle(suptitle, fontsize=15)

    plt.subplot(1, 2, 1)
    sns.boxplot(data=df, x=x, y=y, color=color)

    plt.subplot(1, 2, 2)
    sns.violinplot(data=df, x=x, y=y, color=color)
# Box and violin plots of svcscan.kernel_drivers for each Class
viobox(
    df=df,
    x='Class',
    y='svcscan.kernel_drivers',
    suptitle='Analysis of the svcscan.kernel_drivers variable',
)
The plot above on the left is a box plot, while the plot on the right is a violin plot. Only malware records have svcscan.kernel_drivers values below 200. If the value of the svcscan.kernel_drivers variable is below 200, there is a high chance that it is a malware attack.
# Split the records by Class label into one DataFrame per class
is_benign = df['Class'] == 'Benign'
is_malware = df['Class'] == 'Malware'
df_benign = df[is_benign]
df_malicious = df[is_malware]
df_benign['svcscan.kernel_drivers'].describe()
count 29231.000000 mean 221.990524 std 0.127124 min 220.000000 25% 222.000000 50% 222.000000 75% 222.000000 max 222.000000 Name: svcscan.kernel_drivers, dtype: float64
# Summary statistics of svcscan.kernel_drivers for the malware records
df_malicious['svcscan.kernel_drivers'].describe()
count 28831.000000 mean 220.821581 std 2.710280 min 55.000000 25% 221.000000 50% 221.000000 75% 221.000000 max 222.000000 Name: svcscan.kernel_drivers, dtype: float64
Analysis of the svcscan.nservices variable
# Box and violin plots of svcscan.nservices for each Class
viobox(
    df=df,
    x='Class',
    y='svcscan.nservices',
    suptitle='Analysis of the svcscan.nservices variable',
)
Only malware records have svcscan.nservices values below 350. If the value of the svcscan.nservices variable is below 350, there is a high chance that it is a malware attack.
# Summary statistics of svcscan.nservices for the benign records
df_benign['svcscan.nservices'].describe()
count 29231.000000 mean 394.205433 std 1.352629 min 388.000000 25% 392.000000 50% 395.000000 75% 395.000000 max 395.000000 Name: svcscan.nservices, dtype: float64
# Summary statistics of svcscan.nservices for the malware records
df_malicious['svcscan.nservices'].describe()
count 28831.000000 mean 388.489057 std 4.814831 min 94.000000 25% 389.000000 50% 389.000000 75% 389.000000 max 395.000000 Name: svcscan.nservices, dtype: float64
Analysis of the svcscan.shared_process_services variable
# Box and violin plots of svcscan.shared_process_services for each Class
viobox(
    df=df,
    x='Class',
    y='svcscan.shared_process_services',
    suptitle='Analysis of the svcscan.shared_process_services variable',
)
Only malware records have svcscan.shared_process_services values below 100. If the value of the svcscan.shared_process_services variable is below 100, there is a high chance that it is a malware attack.
# Summary statistics of svcscan.shared_process_services for the benign records
df_benign['svcscan.shared_process_services'].describe()
count 29231.000000 mean 117.981766 std 0.251528 min 114.000000 25% 118.000000 50% 118.000000 75% 118.000000 max 118.000000 Name: svcscan.shared_process_services, dtype: float64
# Summary statistics of svcscan.shared_process_services for the malware records
df_malicious['svcscan.shared_process_services'].describe()
count 28831.000000 mean 115.777947 std 1.528642 min 26.000000 25% 116.000000 50% 116.000000 75% 116.000000 max 118.000000 Name: svcscan.shared_process_services, dtype: float64
Analysis of the handles.nevent variable
# Box and violin plots of handles.nevent for each Class
viobox(
    df=df,
    x='Class',
    y='handles.nevent',
    suptitle='Analysis of the handles.nevent variable',
)
For the variable handles.nevent, most of the malware attacks have a value of 3000 as indicated in the violin plot above. Most benign records are within the range of 3500-5000. Values below 3000 could be malicious attacks.
# Summary statistics of handles.nevent for the benign records
df_benign['handles.nevent'].describe()
count 29231.000000 mean 4278.939722 std 465.809494 min 1636.000000 25% 4083.000000 50% 4322.000000 75% 4506.000000 max 7892.000000 Name: handles.nevent, dtype: float64
# Summary statistics of handles.nevent for the malware records
df_malicious['handles.nevent'].describe()
count 28831.000000 mean 2867.048836 std 288.178616 min 966.000000 25% 2828.000000 50% 2925.000000 75% 3012.000000 max 4357.000000 Name: handles.nevent, dtype: float64
Analysis of the 'dlllist.avg_dlls_per_proc' variable
# Box and violin plots of dlllist.avg_dlls_per_proc for each Class
viobox(
    df=df,
    x='Class',
    y='dlllist.avg_dlls_per_proc',
    suptitle='Analysis of the dlllist.avg_dlls_per_proc variable',
)
For the variable dlllist.avg_dlls_per_proc, most of the malware attacks have values ranging from 35 - 40 as indicated in the violin plot above. Most benign records are above the value 40. If the value of dlllist.avg_dlls_per_proc is below 40, there is a high chance that it is a malicious attack.
# Same variable, this time broken down by category_new instead of Class
viobox(
    df=df,
    x='category_new',
    y='dlllist.avg_dlls_per_proc',
    suptitle='Analysis of the dlllist.avg_dlls_per_proc variable',
)
# Summary statistics of dlllist.avg_dlls_per_proc for the benign records
df_benign['dlllist.avg_dlls_per_proc'].describe()
count 29231.000000 mean 48.923975 std 2.711583 min 34.538462 25% 47.309795 50% 49.597260 75% 50.727273 max 53.170732 Name: dlllist.avg_dlls_per_proc, dtype: float64
# Summary statistics of dlllist.avg_dlls_per_proc for the malware records
df_malicious['dlllist.avg_dlls_per_proc'].describe()
count 28831.000000 mean 38.494404 std 2.023574 min 7.333333 25% 37.863636 50% 38.840909 75% 39.184211 max 46.968254 Name: dlllist.avg_dlls_per_proc, dtype: float64
Analysis of the 'dlllist.ndlls' variable
# Box and violin plots of dlllist.ndlls for each Class
viobox(
    df=df,
    x='Class',
    y='dlllist.ndlls',
    suptitle='Analysis of the dlllist.ndlls variable',
)
For the variable dlllist.ndlls, most of the malware attacks have values ranging from 1400 - 1700 as indicated in the violin plot above. Most benign records are within the range of 2000-2300. If the value of dlllist.ndlls is below 1500, there is a high chance that it is a malicious attack.
# Summary statistics of dlllist.ndlls for the benign records
df_benign['dlllist.ndlls'].describe()
count 29231.000000 mean 2082.665253 std 213.655439 min 911.000000 25% 2000.000000 50% 2086.000000 75% 2154.000000 max 3243.000000 Name: dlllist.ndlls, dtype: float64
# Summary statistics of dlllist.ndlls for the malware records
df_malicious['dlllist.ndlls'].describe()
count 28831.000000 mean 1539.513753 std 155.928212 min 670.000000 25% 1474.500000 50% 1557.000000 75% 1619.000000 max 3443.000000 Name: dlllist.ndlls, dtype: float64
Analysis of the 'handles.nkey' variable
# Box and violin plots of handles.nkey for each Class
viobox(
    df=df,
    x='Class',
    y='handles.nkey',
    suptitle='Analysis of the handles.nkey variable',
)
# Summary statistics of handles.nkey for the benign records
df_benign['handles.nkey'].describe()
count 29231.000000 mean 879.413773 std 115.842483 min 327.000000 25% 803.000000 50% 857.000000 75% 922.000000 max 1525.000000 Name: handles.nkey, dtype: float64
# Summary statistics of handles.nkey for the malware records
df_malicious['handles.nkey'].describe()
count 28831.000000 mean 669.515521 std 99.228141 min 284.000000 25% 664.000000 50% 676.000000 75% 696.000000 max 2668.000000 Name: handles.nkey, dtype: float64
Analysis of the pslist.avg_handlers variable
# Box and violin plots of pslist.avg_handlers for each Class
viobox(
    df=df,
    x='Class',
    y='pslist.avg_handlers',
    suptitle='Analysis of the pslist.avg_handlers variable',
)
Analysis of the handles.avg_handles_per_proc variable
# Box and violin plots of handles.avg_handles_per_proc for each Class
viobox(
    df=df,
    x='Class',
    y='handles.avg_handles_per_proc',
    suptitle='Analysis of the handles.avg_handles_per_proc variable',
);
# Zooming In: the same box/violin pair with the y-axis limited to 0-500
zoom_variable = 'handles.avg_handles_per_proc'
plt.figure(figsize=[20, 5])
plt.suptitle('Analysis of the handles.avg_handles_per_proc variable', fontsize=15)
for panel, draw in enumerate((sns.boxplot, sns.violinplot), start=1):
    plt.subplot(1, 2, panel)
    draw(data=df, x='Class', y=zoom_variable, color=base_color)
    plt.ylim(0, 500)
Most benign values of the handles.avg_handles_per_proc variable are within the range of 200-350. If the value is below 200 or above 350, it could be a malware attack.
# Summary statistics of handles.avg_handles_per_proc for the benign records
df_benign['handles.avg_handles_per_proc'].describe()
count 29231.000000 mean 286.780925 std 17.466239 min 208.270833 25% 275.509461 50% 291.257965 75% 301.048507 max 318.162791 Name: handles.avg_handles_per_proc, dtype: float64
# Summary statistics of handles.avg_handles_per_proc for the malware records
df_malicious['handles.avg_handles_per_proc'].describe()
count 28831.000000 mean 212.413518 std 200.484932 min 71.139241 25% 203.136364 50% 209.674419 75% 215.054054 max 33784.193550 Name: handles.avg_handles_per_proc, dtype: float64
Correlation Analysis
# Scatter plot of svcscan.kernel_drivers against svcscan.nservices, colored by category_new
sns.relplot(
    data=df,
    x='svcscan.nservices',
    y='svcscan.kernel_drivers',
    hue='category_new',
)
plt.title('Relationship between svcscan.nservices and svcscan.kernel_drivers variables');
There is a high positive correlation between svcscan.nservices and svcscan.kernel_drivers. The only points showing are the malware.
Plot Matrices
# Pairwise scatter plots of the malfind.* variables, with histograms on the diagonal
malfind_vars = [
    'malfind.ninjections',
    'malfind.commitCharge',
    'malfind.protection',
    'malfind.uniqueInjections',
]
g = sns.PairGrid(data=df, vars=malfind_vars)
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter);
There is a high correlation between malfind.ninjections and malfind.protection.
# Pairwise scatter plots of the svcscan.* variables, with histograms on the diagonal
svcscan_vars = [
    'svcscan.nservices',
    'svcscan.kernel_drivers',
    'svcscan.fs_drivers',
    'svcscan.process_services',
    'svcscan.shared_process_services',
    'svcscan.nactive',
]
g = sns.PairGrid(data=df, vars=svcscan_vars)
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter);
Relationship between svcscan.process_services and svcscan.nactive variables
# svcscan.nactive against svcscan.process_services, colored by category_new
sns.relplot(
    data=df,
    x='svcscan.process_services',
    y='svcscan.nactive',
    hue='category_new',
)
plt.title('Relationship between svcscan.process_services and svcscan.nactive variables');

# Same scatter plot, colored by Class instead
sns.relplot(
    data=df,
    x='svcscan.process_services',
    y='svcscan.nactive',
    hue='Class',
)
plt.title('Relationship between svcscan.process_services and svcscan.nactive variables');
Most of the benign records have values greater than 22.5 of the svcscan.process_services and 110 of the svcscan.nactive variable. Most of the ransomware attacks are within 17.5 - 25 value of the svcscan.process_services variable and 90 - 120 of the svcscan.nactive variable. From the scatter plot, it is easy to identify the clusters.
Relationship between svcscan.shared_process_services and svcscan.process_services
# svcscan.process_services against svcscan.shared_process_services, colored by Class
sns.relplot(
    data=df,
    x='svcscan.shared_process_services',
    y='svcscan.process_services',
    hue='Class',
)
plt.title('Relationship between svcscan.shared_process_services and svcscan.process_services variables');
From the scatter plot between the svcscan.shared_process_services and svcscan.process_services variables, we see that values below 24 (of the svcscan.process_services variable) are malware attacks, while values above 24 are benign. Also, the benign records are above 110 (of the svcscan.shared_process_services variable).
Relationship between handles.nkey and svcscan.process_services
# handles.nkey against svcscan.process_services, colored by Class
sns.relplot(
    data=df,
    x='svcscan.process_services',
    y='handles.nkey',
    hue='Class',
)
plt.title('Relationship between handles.nkey and svcscan.process_services variables');
Most of the benign records are above 24 (of the svcscan.process_services variable) but below 1500 of the handles.nkey.
Relationship between handles.nkey and svcscan.shared_process_services
# handles.nkey against svcscan.shared_process_services, colored by Class
sns.relplot(
    data=df,
    x='svcscan.shared_process_services',
    y='handles.nkey',
    hue='Class',
)
plt.title('Relationship between handles.nkey and svcscan.shared_process_services variables');
The values above 118 of the svcscan.shared_process_services variable but below 700 of the handles.nkey variable are benign. Also, the values above 118 of the svcscan.shared_process_services variable but above 1200 of the handles.nkey variable are benign. Most of the values below 118 of the svcscan.shared_process_services variable are malware attacks.
Relationship between dlllist.avg_dlls_per_proc and svcscan.kernel_drivers
# svcscan.kernel_drivers against dlllist.avg_dlls_per_proc, colored by Class
sns.relplot(
    data=df,
    x='dlllist.avg_dlls_per_proc',
    y='svcscan.kernel_drivers',
    hue='Class',
)
plt.title('Relationship between dlllist.avg_dlls_per_proc and svcscan.kernel_drivers variables');
Most of the benign values are above 45 (of the dlllist.avg_dlls_per_proc) and above 200 of the svcscan.kernel_drivers variable.
Relationship between handles.nevent and handles.nkey
# handles.nevent against handles.nkey, colored by Class
sns.relplot(
    data=df,
    x='handles.nkey',
    y='handles.nevent',
    hue='Class',
)
plt.title('Relationship between handles.nevent and handles.nkey variables');
The benign values are above 3500 of the handles.nevent variable but between the range of 600 - 1600 of handles.nkey variable. Malware attacks are below 4000 of the handles.nevent variable.
# handles.ndirectory against handles.nevent, colored by Class
sns.relplot(
    data=df,
    x='handles.nevent',
    y='handles.ndirectory',
    hue='Class',
)
plt.title('Relationship between handles.nevent and handles.ndirectory variables');
Most of the benign class are below 200 of the handles.ndirectory variable, but above 4000 of the handles.nevent variable. All the values above 200 of the handles.ndirectory variable belong to the malware class.
Relationship between ldrmodules.not_in_load_avg and dlllist.avg_dlls_per_proc variables
# ldrmodules.not_in_load_avg against dlllist.avg_dlls_per_proc, colored by Class
sns.relplot(
    data=df,
    x='dlllist.avg_dlls_per_proc',
    y='ldrmodules.not_in_load_avg',
    hue='Class',
)
plt.title('Relationship between ldrmodules.not_in_load_avg and dlllist.avg_dlls_per_proc variables');
Most of the records in the benign class have values above 40 for the dlllist.avg_dlls_per_proc but below 0.1 (ldrmodules.not_in_load_avg). All values above 0.1 of the ldrmodules.not_in_load_avg variable are malware attacks. Also, values below 30 of the dlllist.avg_dlls_per_proc variable are malware attacks.
Relationship between handles.nevent and handles.nmutant
# handles.nmutant against handles.nevent, colored by Class
sns.relplot(
    data=df,
    x='handles.nevent',
    y='handles.nmutant',
    hue='Class',
)
plt.title('Relationship between handles.nevent and handles.nmutant variables');
The benign class has values greater than 3000 (of the handles.nevent) and greater than 300 of the handles.nmutant variable. All the records that have values greater than 4500 (of the handles.nevent) are benign. All values that are less than 3000 (of the handles.nevent) are malware. There is a small cluster of malware class between 500 - 600 of the handles.nmutant variable and 2500 - 3500 of the handles.nevent variable.
# Pairwise scatter plots of the handles.* variables, with histograms on the diagonal
handles_vars = [
    'handles.nhandles',
    'handles.avg_handles_per_proc',
    'handles.nfile',
    'handles.nevent',
    'handles.ndesktop',
    'handles.nkey',
    'handles.nthread',
    'handles.ndirectory',
    'handles.nsemaphore',
    'handles.ntimer',
    'handles.nsection',
    'handles.nmutant',
]
g = sns.PairGrid(data=df, vars=handles_vars)
g.map_diag(plt.hist)
g.map_offdiag(plt.scatter);
Correlation Heatmap
# Split the columns into two subsets (first 25, then the rest) for the correlation heatmaps
split_at = 25
subset_1, subset_2 = df.columns[:split_at], df.columns[split_at:]
for column_subset in (subset_1, subset_2):
    print(column_subset)
Index(['pslist.nproc', 'pslist.nppid', 'pslist.avg_threads',
'pslist.avg_handlers', 'dlllist.ndlls', 'dlllist.avg_dlls_per_proc',
'handles.nhandles', 'handles.avg_handles_per_proc', 'handles.nfile',
'handles.nevent', 'handles.ndesktop', 'handles.nkey', 'handles.nthread',
'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer',
'handles.nsection', 'handles.nmutant', 'ldrmodules.not_in_load',
'ldrmodules.not_in_init', 'ldrmodules.not_in_mem',
'ldrmodules.not_in_load_avg', 'ldrmodules.not_in_init_avg',
'ldrmodules.not_in_mem_avg', 'malfind.ninjections'],
dtype='object')
Index(['malfind.commitCharge', 'malfind.protection',
'malfind.uniqueInjections', 'psxview.not_in_pslist',
'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool',
'psxview.not_in_pspcid_list', 'psxview.not_in_csrss_handles',
'psxview.not_in_session', 'psxview.not_in_deskthrd',
'psxview.not_in_pslist_false_avg',
'psxview.not_in_eprocess_pool_false_avg',
'psxview.not_in_ethread_pool_false_avg',
'psxview.not_in_pspcid_list_false_avg',
'psxview.not_in_csrss_handles_false_avg',
'psxview.not_in_session_false_avg', 'psxview.not_in_deskthrd_false_avg',
'modules.nmodules', 'svcscan.nservices', 'svcscan.kernel_drivers',
'svcscan.fs_drivers', 'svcscan.process_services',
'svcscan.shared_process_services', 'svcscan.nactive',
'callbacks.ncallbacks', 'callbacks.nanonymous', 'callbacks.ngeneric',
'Class', 'category_new', 'type'],
dtype='object')
# Pearson's Correlation Plot with Seaborn (all numeric columns)
# Class, category_new and type still hold strings at this point, so restrict
# the frame to numeric columns first: pandas >= 2.0 raises a ValueError when
# corr() is given non-numeric data instead of silently dropping it.
numeric_df = df.select_dtypes(include='number')
fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(numeric_df.corr(method='pearson'), annot=True, linewidths=.5, ax=ax);
# Pearson correlation heatmap restricted to the columns in subset_1
corr_subset_1 = df[subset_1].corr(method='pearson')
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corr_subset_1, annot=True, linewidths=.5, ax=ax);
# Pearson's Correlation Plot with Seaborn (numeric columns of subset_2)
# subset_2 ends with Class, category_new and type, which still hold strings
# here; keep only the numeric columns so corr() does not raise on pandas >= 2.0.
numeric_subset_2 = df[subset_2].select_dtypes(include='number')
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(numeric_subset_2.corr(method='pearson'), annot=True, linewidths=.5, ax=ax);
There is a high positive correlation between:
In this section, I will train a linear classifier and a non-linear classifier to detect malicious and benign connections.
Label Encoding
Convert the 'Benign' class to 0 and the 'Malware' class to 1
# Encode the target labels: 'Benign' -> 0, 'Malware' -> 1
mapping = {'Benign': 0, 'Malware': 1}
df['Class'] = df['Class'].map(mapping)
df['Class'].head()
0 0 1 0 2 0 3 0 4 0 Name: Class, dtype: int64
# Keep a copy of the encoded dataframe for the second experiment
df2 = df.copy()

# Save the dataframe to CSV before training, then list its columns
training_csv_path = 'CleanedDataBeforeTraining.csv'
df.to_csv(training_csv_path)
df.columns
Index(['pslist.nproc', 'pslist.nppid', 'pslist.avg_threads',
'pslist.avg_handlers', 'dlllist.ndlls', 'dlllist.avg_dlls_per_proc',
'handles.nhandles', 'handles.avg_handles_per_proc', 'handles.nfile',
'handles.nevent', 'handles.ndesktop', 'handles.nkey', 'handles.nthread',
'handles.ndirectory', 'handles.nsemaphore', 'handles.ntimer',
'handles.nsection', 'handles.nmutant', 'ldrmodules.not_in_load',
'ldrmodules.not_in_init', 'ldrmodules.not_in_mem',
'ldrmodules.not_in_load_avg', 'ldrmodules.not_in_init_avg',
'ldrmodules.not_in_mem_avg', 'malfind.ninjections',
'malfind.commitCharge', 'malfind.protection',
'malfind.uniqueInjections', 'psxview.not_in_pslist',
'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool',
'psxview.not_in_pspcid_list', 'psxview.not_in_csrss_handles',
'psxview.not_in_session', 'psxview.not_in_deskthrd',
'psxview.not_in_pslist_false_avg',
'psxview.not_in_eprocess_pool_false_avg',
'psxview.not_in_ethread_pool_false_avg',
'psxview.not_in_pspcid_list_false_avg',
'psxview.not_in_csrss_handles_false_avg',
'psxview.not_in_session_false_avg', 'psxview.not_in_deskthrd_false_avg',
'modules.nmodules', 'svcscan.nservices', 'svcscan.kernel_drivers',
'svcscan.fs_drivers', 'svcscan.process_services',
'svcscan.shared_process_services', 'svcscan.nactive',
'callbacks.ncallbacks', 'callbacks.nanonymous', 'callbacks.ngeneric',
'Class', 'category_new', 'type'],
dtype='object')
Separate the features and target variable
The target variable is the Class variable.
# Features: every column except the three label columns; target: the Class values
label_columns = ['category_new', 'type', 'Class']
X = df.copy().drop(columns=label_columns)
y = df['Class'].values
display(X)
y
| pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | handles.nfile | handles.nevent | handles.ndesktop | handles.nkey | handles.nthread | handles.ndirectory | handles.nsemaphore | handles.ntimer | handles.nsection | handles.nmutant | ldrmodules.not_in_load | ldrmodules.not_in_init | ldrmodules.not_in_mem | ldrmodules.not_in_load_avg | ldrmodules.not_in_init_avg | ldrmodules.not_in_mem_avg | malfind.ninjections | malfind.commitCharge | malfind.protection | malfind.uniqueInjections | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_pspcid_list | psxview.not_in_csrss_handles | psxview.not_in_session | psxview.not_in_deskthrd | psxview.not_in_pslist_false_avg | psxview.not_in_eprocess_pool_false_avg | psxview.not_in_ethread_pool_false_avg | psxview.not_in_pspcid_list_false_avg | psxview.not_in_csrss_handles_false_avg | psxview.not_in_session_false_avg | psxview.not_in_deskthrd_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45 | 17 | 10.555556 | 202.844444 | 1694 | 38.500000 | 9129 | 212.302326 | 670 | 3161 | 46 | 716 | 887 | 104 | 671 | 125 | 184 | 257 | 53 | 95 | 53 | 0.030372 | 0.054441 | 0.030372 | 5 | 21 | 30 | 1.250000 | 2 | 0 | 3 | 2 | 7 | 4 | 9 | 0.042553 | 0.0 | 0.063830 | 0.042553 | 0.148936 | 0.085106 | 0.191489 | 138 | 389 | 221 | 26 | 24 | 116 | 121 | 87 | 0 | 8 |
| 1 | 47 | 19 | 11.531915 | 242.234043 | 2074 | 44.127660 | 11385 | 242.234043 | 840 | 3761 | 51 | 1011 | 1030 | 117 | 766 | 148 | 337 | 394 | 77 | 123 | 77 | 0.036167 | 0.057774 | 0.036167 | 12 | 77 | 72 | 1.714286 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.085106 | 0.042553 | 0.127660 | 138 | 392 | 222 | 26 | 24 | 118 | 122 | 87 | 0 | 8 |
| 2 | 40 | 14 | 14.725000 | 288.225000 | 1932 | 48.300000 | 11529 | 288.225000 | 1050 | 3996 | 45 | 784 | 1241 | 100 | 645 | 138 | 369 | 338 | 51 | 89 | 51 | 0.026114 | 0.045571 | 0.026114 | 5 | 6 | 30 | 1.250000 | 0 | 0 | 0 | 0 | 4 | 2 | 5 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.100000 | 0.050000 | 0.125000 | 137 | 395 | 222 | 26 | 27 | 118 | 120 | 88 | 0 | 8 |
| 3 | 32 | 13 | 13.500000 | 264.281250 | 1445 | 45.156250 | 8457 | 264.281250 | 630 | 2961 | 36 | 654 | 792 | 83 | 567 | 127 | 186 | 242 | 31 | 62 | 31 | 0.021483 | 0.042966 | 0.021483 | 2 | 2 | 12 | 1.000000 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.125000 | 0.062500 | 0.187500 | 138 | 395 | 222 | 26 | 27 | 118 | 120 | 88 | 0 | 8 |
| 4 | 42 | 16 | 11.452381 | 281.333333 | 2067 | 49.214286 | 11816 | 281.333333 | 908 | 3834 | 45 | 1252 | 942 | 103 | 825 | 135 | 375 | 429 | 102 | 143 | 102 | 0.047820 | 0.067042 | 0.047820 | 12 | 77 | 72 | 2.000000 | 4 | 0 | 4 | 4 | 8 | 6 | 10 | 0.086957 | 0.0 | 0.086957 | 0.086957 | 0.173913 | 0.130435 | 0.217391 | 138 | 392 | 222 | 26 | 24 | 118 | 124 | 87 | 0 | 8 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 58591 | 37 | 15 | 10.108108 | 215.486487 | 1453 | 39.270270 | 7973 | 215.486487 | 630 | 2819 | 40 | 668 | 664 | 92 | 596 | 113 | 161 | 230 | 43 | 79 | 43 | 0.029292 | 0.053815 | 0.029292 | 3 | 3 | 18 | 1.000000 | 1 | 0 | 1 | 1 | 5 | 3 | 7 | 0.026316 | 0.0 | 0.026316 | 0.026316 | 0.131579 | 0.078947 | 0.184211 | 138 | 389 | 221 | 26 | 24 | 116 | 120 | 86 | 0 | 8 |
| 58592 | 37 | 14 | 9.945946 | 190.216216 | 1347 | 36.405405 | 7038 | 190.216216 | 603 | 2394 | 39 | 555 | 624 | 91 | 482 | 102 | 111 | 203 | 43 | 79 | 43 | 0.030958 | 0.056875 | 0.030958 | 3 | 3 | 18 | 1.000000 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.108108 | 0.054054 | 0.162162 | 138 | 389 | 221 | 26 | 24 | 116 | 116 | 88 | 0 | 8 |
| 58593 | 38 | 15 | 9.842105 | 210.026316 | 1448 | 38.105263 | 7982 | 215.729730 | 624 | 2816 | 40 | 673 | 661 | 92 | 596 | 113 | 167 | 230 | 43 | 79 | 43 | 0.029292 | 0.053815 | 0.029292 | 3 | 3 | 18 | 1.000000 | 2 | 0 | 3 | 2 | 7 | 4 | 9 | 0.050000 | 0.0 | 0.075000 | 0.050000 | 0.175000 | 0.100000 | 0.225000 | 138 | 389 | 221 | 26 | 24 | 116 | 120 | 88 | 0 | 8 |
| 58594 | 37 | 15 | 10.243243 | 215.513513 | 1452 | 39.243243 | 7974 | 215.513513 | 632 | 2819 | 40 | 668 | 667 | 92 | 596 | 113 | 161 | 230 | 43 | 79 | 43 | 0.029392 | 0.053999 | 0.029392 | 3 | 3 | 18 | 1.000000 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.108108 | 0.054054 | 0.162162 | 138 | 389 | 221 | 26 | 24 | 116 | 120 | 87 | 0 | 8 |
| 58595 | 38 | 15 | 9.868421 | 213.026316 | 1487 | 39.131579 | 8095 | 213.026316 | 637 | 2843 | 41 | 683 | 669 | 94 | 598 | 116 | 164 | 236 | 44 | 81 | 44 | 0.029255 | 0.053856 | 0.029255 | 3 | 3 | 18 | 1.000000 | 3 | 0 | 3 | 3 | 7 | 5 | 9 | 0.073171 | 0.0 | 0.073171 | 0.073171 | 0.170732 | 0.121951 | 0.219512 | 138 | 389 | 221 | 26 | 24 | 116 | 120 | 86 | 0 | 8 |
58062 rows × 52 columns
array([0, 0, 0, ..., 1, 1, 1], dtype=int64)
Split Data into the training set and test set
70% of the data will be used to train the models, while the remaining 30% will be used to evaluate them.
from sklearn.model_selection import train_test_split
# Fixed seed so the split (and the models trained later) are reproducible
SEED = 20
# 70/30 split, stratified on y so both subsets keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=SEED,
    stratify=y,
)
# Check: show the size and contents of each subset
for _subset, _features, _target in (("Training", X_train, y_train),
                                    ("Testing", X_test, y_test)):
    print(f"{_subset} set")
    print(f"{_subset} set size", _features.shape)
    display(_features)
    display(_target)
Training set Training set size (40643, 52)
| pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | handles.nfile | handles.nevent | handles.ndesktop | handles.nkey | handles.nthread | handles.ndirectory | handles.nsemaphore | handles.ntimer | handles.nsection | handles.nmutant | ldrmodules.not_in_load | ldrmodules.not_in_init | ldrmodules.not_in_mem | ldrmodules.not_in_load_avg | ldrmodules.not_in_init_avg | ldrmodules.not_in_mem_avg | malfind.ninjections | malfind.commitCharge | malfind.protection | malfind.uniqueInjections | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_pspcid_list | psxview.not_in_csrss_handles | psxview.not_in_session | psxview.not_in_deskthrd | psxview.not_in_pslist_false_avg | psxview.not_in_eprocess_pool_false_avg | psxview.not_in_ethread_pool_false_avg | psxview.not_in_pspcid_list_false_avg | psxview.not_in_csrss_handles_false_avg | psxview.not_in_session_false_avg | psxview.not_in_deskthrd_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36982 | 44 | 17 | 9.636364 | 200.931818 | 1674 | 38.045455 | 8841 | 200.931818 | 671 | 3076 | 47 | 699 | 784 | 109 | 653 | 126 | 182 | 269 | 51 | 94 | 51 | 0.029582 | 0.054524 | 0.029582 | 4 | 4 | 24 | 1.000000 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.090909 | 0.045455 | 0.136364 | 138 | 389 | 221 | 26 | 24 | 116 | 122 | 86 | 0 | 8 |
| 48875 | 28 | 10 | 12.964286 | 260.107143 | 1258 | 44.928571 | 7283 | 260.107143 | 533 | 2606 | 31 | 549 | 646 | 73 | 595 | 109 | 135 | 216 | 29 | 56 | 29 | 0.022551 | 0.043546 | 0.022551 | 2 | 2 | 12 | 1.000000 | 0 | 0 | 0 | 0 | 4 | 2 | 5 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.142857 | 0.071429 | 0.178571 | 138 | 389 | 221 | 26 | 24 | 116 | 117 | 88 | 0 | 8 |
| 53467 | 35 | 14 | 10.400000 | 215.400000 | 1384 | 39.542857 | 7539 | 215.400000 | 627 | 2633 | 37 | 599 | 637 | 88 | 572 | 107 | 138 | 223 | 43 | 77 | 43 | 0.030561 | 0.054726 | 0.030561 | 2 | 2 | 12 | 1.000000 | 2 | 0 | 2 | 2 | 6 | 4 | 8 | 0.054054 | 0.0 | 0.054054 | 0.054054 | 0.162162 | 0.108108 | 0.216216 | 138 | 389 | 221 | 26 | 24 | 116 | 120 | 87 | 0 | 8 |
| 57183 | 39 | 15 | 10.743590 | 217.384615 | 1558 | 39.948718 | 8478 | 217.384615 | 643 | 2968 | 42 | 655 | 776 | 99 | 634 | 120 | 177 | 259 | 46 | 84 | 46 | 0.028696 | 0.052402 | 0.028696 | 3 | 3 | 18 | 1.000000 | 3 | 0 | 3 | 3 | 7 | 5 | 9 | 0.071429 | 0.0 | 0.071429 | 0.071429 | 0.166667 | 0.119048 | 0.214286 | 138 | 389 | 221 | 26 | 24 | 116 | 123 | 87 | 0 | 8 |
| 55768 | 37 | 15 | 10.243243 | 215.054054 | 1446 | 39.081081 | 7957 | 215.054054 | 631 | 2813 | 40 | 667 | 666 | 92 | 596 | 113 | 161 | 230 | 43 | 79 | 43 | 0.029472 | 0.054147 | 0.029472 | 3 | 3 | 18 | 1.000000 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.108108 | 0.054054 | 0.162162 | 138 | 389 | 221 | 26 | 24 | 116 | 119 | 86 | 0 | 8 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2764 | 42 | 12 | 13.316530 | 301.911126 | 2194 | 52.107325 | 12714 | 302.717002 | 1093 | 4469 | 46 | 941 | 1223 | 105 | 773 | 147 | 416 | 426 | 88 | 126 | 88 | 0.040050 | 0.057733 | 0.040050 | 6 | 7 | 40 | 1.176174 | 0 | 0 | 1 | 0 | 5 | 2 | 7 | 0.020603 | 0.0 | 0.023256 | 0.020603 | 0.116279 | 0.067114 | 0.162791 | 138 | 395 | 222 | 26 | 27 | 118 | 124 | 88 | 0 | 8 |
| 22016 | 50 | 18 | 11.985931 | 271.454189 | 2375 | 47.500000 | 13572 | 271.454189 | 1146 | 4749 | 56 | 1043 | 1264 | 121 | 809 | 149 | 461 | 424 | 106 | 154 | 106 | 0.043621 | 0.063374 | 0.043621 | 8 | 10 | 48 | 1.333333 | 2 | 0 | 2 | 2 | 6 | 4 | 8 | 0.038462 | 0.0 | 0.038462 | 0.038462 | 0.115385 | 0.076923 | 0.153846 | 138 | 392 | 222 | 26 | 24 | 118 | 126 | 87 | 0 | 8 |
| 49923 | 38 | 15 | 9.815789 | 209.447368 | 1446 | 38.052632 | 7960 | 215.135135 | 624 | 2813 | 40 | 667 | 660 | 92 | 596 | 113 | 160 | 230 | 43 | 79 | 43 | 0.029372 | 0.053962 | 0.029372 | 3 | 3 | 18 | 1.000000 | 3 | 0 | 4 | 3 | 8 | 5 | 10 | 0.073171 | 0.0 | 0.097561 | 0.073171 | 0.195122 | 0.121951 | 0.243902 | 138 | 389 | 221 | 26 | 24 | 116 | 119 | 86 | 0 | 8 |
| 57132 | 44 | 17 | 9.795455 | 200.659091 | 1630 | 37.045455 | 8786 | 204.325581 | 668 | 3078 | 47 | 682 | 784 | 107 | 650 | 126 | 181 | 269 | 69 | 105 | 69 | 0.043152 | 0.065666 | 0.043152 | 3 | 3 | 18 | 1.000000 | 0 | 0 | 0 | 0 | 4 | 2 | 9 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.090909 | 0.045455 | 0.204545 | 138 | 389 | 221 | 26 | 24 | 116 | 122 | 54 | 0 | 8 |
| 15985 | 42 | 12 | 13.297032 | 302.387573 | 2177 | 51.855969 | 12700 | 302.387573 | 1140 | 4506 | 46 | 877 | 1224 | 105 | 771 | 142 | 414 | 402 | 78 | 118 | 78 | 0.036532 | 0.055141 | 0.036532 | 5 | 6 | 30 | 1.250000 | 1 | 0 | 1 | 1 | 5 | 3 | 7 | 0.023256 | 0.0 | 0.023256 | 0.023256 | 0.116279 | 0.069767 | 0.162791 | 138 | 395 | 222 | 26 | 27 | 118 | 123 | 88 | 0 | 8 |
40643 rows × 52 columns
array([1, 1, 1, ..., 1, 1, 0], dtype=int64)
Testing set Testing set size (17419, 52)
| pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.avg_handlers | dlllist.ndlls | dlllist.avg_dlls_per_proc | handles.nhandles | handles.avg_handles_per_proc | handles.nfile | handles.nevent | handles.ndesktop | handles.nkey | handles.nthread | handles.ndirectory | handles.nsemaphore | handles.ntimer | handles.nsection | handles.nmutant | ldrmodules.not_in_load | ldrmodules.not_in_init | ldrmodules.not_in_mem | ldrmodules.not_in_load_avg | ldrmodules.not_in_init_avg | ldrmodules.not_in_mem_avg | malfind.ninjections | malfind.commitCharge | malfind.protection | malfind.uniqueInjections | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_pspcid_list | psxview.not_in_csrss_handles | psxview.not_in_session | psxview.not_in_deskthrd | psxview.not_in_pslist_false_avg | psxview.not_in_eprocess_pool_false_avg | psxview.not_in_ethread_pool_false_avg | psxview.not_in_pspcid_list_false_avg | psxview.not_in_csrss_handles_false_avg | psxview.not_in_session_false_avg | psxview.not_in_deskthrd_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.fs_drivers | svcscan.process_services | svcscan.shared_process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21543 | 31 | 11 | 13.703655 | 305.364025 | 1571 | 50.677419 | 9467 | 315.576159 | 740 | 3196 | 33 | 757 | 840 | 82 | 685 | 123 | 262 | 344 | 56 | 83 | 56 | 0.036152 | 0.053583 | 0.036152 | 3 | 4 | 18 | 1.500000 | 8 | 0 | 9 | 8 | 13 | 10 | 15 | 0.205128 | 0.0 | 0.230769 | 0.205128 | 0.333333 | 0.256410 | 0.384615 | 138 | 395 | 222 | 26 | 27 | 118 | 123 | 88 | 0 | 8 |
| 58064 | 41 | 16 | 9.853659 | 208.853658 | 1613 | 39.341463 | 8563 | 208.853658 | 649 | 2989 | 44 | 675 | 727 | 103 | 642 | 126 | 181 | 267 | 45 | 85 | 45 | 0.027190 | 0.051360 | 0.027190 | 3 | 3 | 18 | 1.000000 | 4 | 0 | 4 | 4 | 8 | 6 | 10 | 0.088889 | 0.0 | 0.088889 | 0.088889 | 0.177778 | 0.133333 | 0.222222 | 138 | 389 | 221 | 26 | 24 | 116 | 123 | 86 | 0 | 8 |
| 47128 | 40 | 15 | 10.650000 | 214.450000 | 1575 | 39.375000 | 8578 | 214.450000 | 641 | 2987 | 43 | 665 | 788 | 102 | 638 | 123 | 179 | 260 | 46 | 85 | 46 | 0.028395 | 0.052469 | 0.028395 | 3 | 3 | 18 | 1.000000 | 5 | 0 | 5 | 5 | 9 | 7 | 11 | 0.111111 | 0.0 | 0.111111 | 0.111111 | 0.200000 | 0.155556 | 0.244444 | 138 | 389 | 221 | 26 | 24 | 116 | 123 | 87 | 0 | 8 |
| 16592 | 40 | 12 | 13.308599 | 301.008599 | 2001 | 50.025000 | 12040 | 301.008599 | 1125 | 4340 | 44 | 795 | 1163 | 101 | 738 | 138 | 449 | 336 | 63 | 99 | 63 | 0.032357 | 0.050847 | 0.032357 | 3 | 4 | 18 | 1.500000 | 2 | 0 | 2 | 2 | 6 | 4 | 8 | 0.047619 | 0.0 | 0.047619 | 0.047619 | 0.142857 | 0.095238 | 0.190476 | 138 | 395 | 222 | 26 | 27 | 118 | 123 | 88 | 0 | 8 |
| 2763 | 42 | 17 | 11.608303 | 262.875666 | 1875 | 44.390332 | 11104 | 266.742647 | 898 | 3779 | 46 | 909 | 972 | 103 | 736 | 131 | 370 | 396 | 78 | 118 | 78 | 0.041367 | 0.062237 | 0.041367 | 4 | 5 | 24 | 1.594609 | 9 | 0 | 10 | 9 | 14 | 11 | 16 | 0.180523 | 0.0 | 0.190844 | 0.180523 | 0.268157 | 0.219179 | 0.315820 | 138 | 392 | 222 | 26 | 24 | 118 | 126 | 87 | 0 | 8 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 560 | 40 | 12 | 13.725000 | 302.350000 | 2001 | 50.025000 | 12094 | 302.350000 | 1126 | 4352 | 44 | 795 | 1195 | 101 | 738 | 138 | 452 | 336 | 63 | 99 | 63 | 0.032357 | 0.050847 | 0.032357 | 3 | 4 | 18 | 1.500000 | 2 | 0 | 2 | 2 | 6 | 4 | 8 | 0.047619 | 0.0 | 0.047619 | 0.047619 | 0.142857 | 0.095238 | 0.190476 | 138 | 395 | 222 | 26 | 27 | 118 | 123 | 88 | 0 | 8 |
| 22204 | 66 | 18 | 10.938011 | 247.650504 | 2911 | 44.394689 | 16274 | 264.126011 | 1440 | 5933 | 66 | 1116 | 1596 | 141 | 891 | 154 | 606 | 477 | 106 | 165 | 106 | 0.036042 | 0.056359 | 0.036042 | 9 | 47 | 55 | 1.759675 | 0 | 0 | 4 | 0 | 8 | 2 | 10 | 0.000000 | 0.0 | 0.059493 | 0.000000 | 0.120287 | 0.030397 | 0.147851 | 138 | 392 | 222 | 26 | 24 | 118 | 125 | 87 | 0 | 8 |
| 15933 | 40 | 12 | 12.700000 | 292.175287 | 1985 | 49.625000 | 11687 | 292.175287 | 1106 | 4129 | 44 | 803 | 1089 | 101 | 729 | 141 | 435 | 333 | 64 | 100 | 64 | 0.033126 | 0.051760 | 0.033126 | 3 | 4 | 18 | 1.500000 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.100000 | 0.050000 | 0.150000 | 138 | 395 | 222 | 26 | 27 | 118 | 123 | 88 | 0 | 8 |
| 57302 | 44 | 17 | 9.590909 | 196.795455 | 1619 | 36.795455 | 8589 | 204.500000 | 668 | 2996 | 46 | 686 | 755 | 107 | 613 | 125 | 183 | 265 | 61 | 99 | 61 | 0.038341 | 0.062225 | 0.038341 | 2 | 2 | 12 | 1.000000 | 1 | 0 | 1 | 1 | 5 | 3 | 10 | 0.022222 | 0.0 | 0.022222 | 0.022222 | 0.111111 | 0.066667 | 0.222222 | 138 | 389 | 221 | 26 | 24 | 116 | 122 | 55 | 0 | 8 |
| 13364 | 40 | 12 | 12.731670 | 288.333983 | 1985 | 49.038099 | 11674 | 291.868663 | 1111 | 4090 | 44 | 803 | 1109 | 101 | 729 | 141 | 431 | 333 | 64 | 100 | 64 | 0.033126 | 0.051760 | 0.033126 | 3 | 4 | 18 | 1.500000 | 0 | 0 | 0 | 0 | 4 | 2 | 6 | 0.000000 | 0.0 | 0.012070 | 0.000000 | 0.110863 | 0.049397 | 0.160259 | 138 | 395 | 222 | 26 | 27 | 118 | 122 | 88 | 0 | 8 |
17419 rows × 52 columns
array([0, 1, 1, ..., 0, 1, 0], dtype=int64)
Feature Scaling
Apply normalization to the features to keep them on the same scale.
# feature scaling
from sklearn.preprocessing import MinMaxScaler
# Learn each feature's min/max from the training data only, then apply that
# same mapping to both subsets so no test-set statistics reach the scaler.
normalizer = MinMaxScaler()
normalizer.fit(X_train)
scaled_X_train = normalizer.transform(X_train)
scaled_X_test = normalizer.transform(X_test)
display(scaled_X_train)
scaled_X_test
array([[0.10502283, 0.140625 , 0.52652083, ..., 0.91666667, 0. ,
1. ],
[0.03196347, 0.03125 , 0.74592234, ..., 0.97222222, 0. ,
1. ],
[0.06392694, 0.09375 , 0.57686545, ..., 0.94444444, 0. ,
1. ],
...,
[0.07762557, 0.109375 , 0.53834992, ..., 0.91666667, 0. ,
1. ],
[0.10502283, 0.140625 , 0.53700929, ..., 0.02777778, 0. ,
1. ],
[0.09589041, 0.0625 , 0.76785946, ..., 0.97222222, 0. ,
1. ]])
array([[0.0456621 , 0.046875 , 0.79466708, ..., 0.97222222, 0. ,
1. ],
[0.0913242 , 0.125 , 0.54084653, ..., 0.91666667, 0. ,
1. ],
[0.08675799, 0.109375 , 0.59334732, ..., 0.94444444, 0. ,
1. ],
...,
[0.08675799, 0.0625 , 0.72849865, ..., 0.97222222, 0. ,
1. ],
[0.10502283, 0.140625 , 0.52352412, ..., 0.05555556, 0. ,
1. ],
[0.08675799, 0.0625 , 0.73058654, ..., 0.97222222, 0. ,
1. ]])
Define Custom Functions to Evaluate the Models
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score
from sklearn.metrics import precision_score, f1_score, ConfusionMatrixDisplay
from sklearn.metrics import classification_report
def evaluate_model(y_val, y_pred, average='weighted'):
    """Compute the headline classification metrics for a set of predictions.

    Args
        y_val: array. True labels of the evaluation set.
        y_pred: array. Labels predicted by the model.
        average: str. Averaging strategy forwarded to the precision, recall
            and F1 scorers. Defaults to 'weighted', the value that was
            previously hard-coded, so existing calls are unaffected.

    Returns
        dict. Maps 'accuracy_score', 'precision_score', 'recall_score' and
        'f1_score' to the corresponding metric value.
    """
    return {
        "accuracy_score": accuracy_score(y_val, y_pred),
        "precision_score": precision_score(y_val, y_pred, average=average),
        "recall_score": recall_score(y_val, y_pred, average=average),
        "f1_score": f1_score(y_val, y_pred, average=average),
    }
def plot_confusion_matrix(y_val, y_pred, label):
    '''Draw the confusion matrix of a model's predictions.

    Args
        y_val: array. True labels of the evaluation set.
        y_pred: array. Labels predicted by the model.
        label: list. Display names of the classes, used as the tick labels.

    Returns
        None. The matrix is drawn on a new 10x10 inch matplotlib figure.
    '''
    matrix = confusion_matrix(y_val, y_pred)
    _, axes = plt.subplots(figsize=(10, 10))
    matrix_display = ConfusionMatrixDisplay(confusion_matrix=matrix,
                                            display_labels=label)
    matrix_display.plot(ax=axes, values_format='', xticks_rotation='vertical')
# Display names passed to plot_confusion_matrix as tick labels.
# NOTE(review): assumes Class is encoded as 0 = Benign, 1 = Malicious so the
# names line up with the matrix rows/columns - confirm against the encoding step.
label = ['Benign', 'Malicious']
Training Logistic Regression Model
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier on the scaled training features;
# fit() returns the estimator itself, so it can be chained.
lr_model = LogisticRegression(max_iter=1000).fit(scaled_X_train, y_train)

# Score the held-out test set
lr_y_pred = lr_model.predict(scaled_X_test)
lr_test_result = evaluate_model(y_test, lr_y_pred)
print(lr_test_result)
print(classification_report(y_test, lr_y_pred))
plot_confusion_matrix(y_test, lr_y_pred, label)
{'accuracy_score': 0.9963832596589931, 'precision_score': 0.9963834376844934, 'recall_score': 0.9963832596589931, 'f1_score': 0.9963832665728144}
precision recall f1-score support
0 1.00 1.00 1.00 8770
1 1.00 1.00 1.00 8649
accuracy 1.00 17419
macro avg 1.00 1.00 1.00 17419
weighted avg 1.00 1.00 1.00 17419
Training a Random Forest Model
from sklearn.ensemble import RandomForestClassifier
# Fit a random forest with default hyperparameters; the seed keeps the
# bootstrap samples and feature draws reproducible.
rf_model = RandomForestClassifier(random_state=SEED).fit(scaled_X_train, y_train)

# Score the held-out test set
rf_y_pred = rf_model.predict(scaled_X_test)
rf_test_result = evaluate_model(y_test, rf_y_pred)
print(rf_test_result)
print(classification_report(y_test, rf_y_pred))
plot_confusion_matrix(y_test, rf_y_pred, label)
{'accuracy_score': 1.0, 'precision_score': 1.0, 'recall_score': 1.0, 'f1_score': 1.0}
precision recall f1-score support
0 1.00 1.00 1.00 8770
1 1.00 1.00 1.00 8649
accuracy 1.00 17419
macro avg 1.00 1.00 1.00 17419
weighted avg 1.00 1.00 1.00 17419
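A perfect score on the test set is suspicious: it may indicate overfitting or data leakage. I will fine-tune the model using GridSearchCV to select the best hyperparameters for the random forest model and to obtain a cross-validated accuracy estimate.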
# Fine-Tune the Model
from sklearn.model_selection import GridSearchCV
# Base estimator whose hyperparameters are searched
rf_model = RandomForestClassifier(random_state=SEED)

# Candidate settings: two split criteria x two forest sizes (100 and 200 trees)
parameters = [
    {
        'criterion': ['gini', 'entropy'],
        'n_estimators': np.arange(100, 300, 100),
    },
]

# 5-fold cross-validated search scored on accuracy;
# n_jobs=-1 uses all the system's processors
grid_search = GridSearchCV(rf_model,
                           parameters,
                           scoring='accuracy',
                           cv=5,
                           n_jobs=-1)
grid_search.fit(scaled_X_train, y_train)

# Best mean cross-validated accuracy and the settings that produced it
best_acc, best_parameters = grid_search.best_score_, grid_search.best_params_
rf_score = best_acc * 100
print(f"Best accuracy score {rf_score:.2f} %")
print("Best Parameters:", best_parameters)

# The winning forest, as exposed by the search's best_estimator_ attribute
best_rf_model = grid_search.best_estimator_

# Score the tuned model on the held-out test set
rf_y_pred = best_rf_model.predict(scaled_X_test)
rf_test_result = evaluate_model(y_test, rf_y_pred)
print(rf_test_result)
print(classification_report(y_test, rf_y_pred))
plot_confusion_matrix(y_test, rf_y_pred, label)
Best accuracy score 99.99 %
Best Parameters: {'criterion': 'gini', 'n_estimators': 100}
{'accuracy_score': 1.0, 'precision_score': 1.0, 'recall_score': 1.0, 'f1_score': 1.0}
precision recall f1-score support
0 1.00 1.00 1.00 8770
1 1.00 1.00 1.00 8649
accuracy 1.00 17419
macro avg 1.00 1.00 1.00 17419
weighted avg 1.00 1.00 1.00 17419
Display the Top 10 Important Features
# Rank the features by importance, ascending, and keep the last ten
# positions (the ten most important features).
importances = best_rf_model.feature_importances_
sorted_idx = importances.argsort()
top10_idx = sorted_idx[-10:]

# Horizontal bars; the most important feature ends up at the top
fig = plt.figure(figsize=(10, 10))
plt.barh(X_train.columns[top10_idx], importances[top10_idx])
plt.title("Random Forest Feature Importance", fontsize=16)
plt.ylabel("Features", fontsize=14)
plt.xlabel('Importance', fontsize=14);

# Same ten names, listed from most to least important
top_features = X_train.columns[top10_idx][::-1].tolist()
print(top_features)
['svcscan.shared_process_services', 'svcscan.nservices', 'svcscan.kernel_drivers', 'handles.avg_handles_per_proc', 'handles.nevent', 'handles.nhandles', 'dlllist.avg_dlls_per_proc', 'handles.nmutant', 'pslist.avg_handlers', 'handles.nkey']
Visualize the Results
# Models being compared and their test-set results, in matching order
models = ["Random Forest", "Logistic Regression"]
results = [rf_test_result, lr_test_result]

# One list per metric, holding that metric's value for each model
accuracy_scores = [result['accuracy_score'] for result in results]
precision_scores = [result['precision_score'] for result in results]
recall_scores = [result['recall_score'] for result in results]
f1_scores = [result['f1_score'] for result in results]

# One group of bars per model. The count is derived from `models` so it
# cannot drift from the list, and the positions get their own name so the
# feature matrix `X` is not overwritten by plotting code.
x_pos = np.arange(len(models))

fig = plt.figure(figsize=(8,6))
ax = fig.add_axes([0,0,1,1])
# Each metric is one bar per group, shifted right by its offset
for offset, scores, colour in ((0.00, accuracy_scores, 'darkblue'),
                               (0.20, precision_scores, 'orange'),
                               (0.40, recall_scores, 'brown'),
                               (0.60, f1_scores, 'gray')):
    ax.bar(x_pos + offset, scores, color=colour, width=0.2)

# Centre each tick under its group of four bars
plt.xticks(x_pos + 0.30, models)
plt.ylim(0, 1)
plt.xlabel("ML Algorithms", fontsize=14)
plt.ylabel("Scores", fontsize=14)
plt.title('Test Set Result of Machine Learning Algorithms on Malware Detection', fontsize=16)
# Legend entries follow the order in which the bars were drawn
plt.legend(["Accuracy", "Precision", "Recall", "F1"]);
Drop Correlated Features
# Columns flagged as redundant because they closely track an earlier column
correlated_features = set()

# Correlation is only defined for numeric data. Select the numeric columns
# explicitly: df still holds the string columns 'category_new' and 'type',
# and pandas 2.x raises on them instead of silently skipping them.
correlation_matrix = df.select_dtypes(include='number').corr()

# For every column, look only at the columns to its left (lower triangle),
# so that of each highly correlated pair only the later column is flagged.
for i, column in enumerate(correlation_matrix.columns):
    # The target must never be dropped, however strongly it tracks a feature
    if column == 'Class':
        continue
    # get features whose absolute correlation is greater than 0.9;
    # NaN entries (e.g. from constant columns) compare as False
    if (correlation_matrix.iloc[i, :i].abs() > 0.9).any():
        correlated_features.add(column)

print('There are {} correlated features'.format(len(correlated_features)))
print(correlated_features)
There are 29 correlated features
{'handles.nhandles', 'psxview.not_in_ethread_pool_false_avg', 'ldrmodules.not_in_init', 'malfind.uniqueInjections', 'handles.ntimer', 'dlllist.avg_dlls_per_proc', 'psxview.not_in_pspcid_list_false_avg', 'svcscan.shared_process_services', 'ldrmodules.not_in_init_avg', 'handles.avg_handles_per_proc', 'handles.ndirectory', 'handles.nevent', 'ldrmodules.not_in_mem_avg', 'handles.nthread', 'psxview.not_in_csrss_handles_false_avg', 'psxview.not_in_deskthrd', 'psxview.not_in_session', 'handles.nmutant', 'ldrmodules.not_in_mem', 'psxview.not_in_session_false_avg', 'psxview.not_in_pspcid_list', 'handles.nfile', 'svcscan.fs_drivers', 'ldrmodules.not_in_load', 'psxview.not_in_deskthrd_false_avg', 'psxview.not_in_csrss_handles', 'malfind.protection', 'psxview.not_in_pslist_false_avg', 'handles.nsemaphore'}
# Shape before the flagged columns are removed
print(df2.shape)
# Remove every flagged column from df2 in place
df2.drop(columns=list(correlated_features), inplace=True)
# Shape afterwards, to confirm the columns are gone
df2.shape
(58062, 55)
(58062, 26)
# Preview the first rows of df2 after the correlated columns were dropped
df2.head()
| pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.avg_handlers | dlllist.ndlls | handles.ndesktop | handles.nkey | handles.nsection | ldrmodules.not_in_load_avg | malfind.ninjections | malfind.commitCharge | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_eprocess_pool_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | Class | category_new | type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45 | 17 | 10.555556 | 202.844444 | 1694 | 46 | 716 | 184 | 0.030372 | 5 | 21 | 2 | 0 | 3 | 0.0 | 138 | 389 | 221 | 24 | 121 | 87 | 0 | 8 | 0 | Benign | None |
| 1 | 47 | 19 | 11.531915 | 242.234043 | 2074 | 51 | 1011 | 337 | 0.036167 | 12 | 77 | 0 | 0 | 0 | 0.0 | 138 | 392 | 222 | 24 | 122 | 87 | 0 | 8 | 0 | Benign | None |
| 2 | 40 | 14 | 14.725000 | 288.225000 | 1932 | 45 | 784 | 369 | 0.026114 | 5 | 6 | 0 | 0 | 0 | 0.0 | 137 | 395 | 222 | 27 | 120 | 88 | 0 | 8 | 0 | Benign | None |
| 3 | 32 | 13 | 13.500000 | 264.281250 | 1445 | 36 | 654 | 186 | 0.021483 | 2 | 2 | 0 | 0 | 0 | 0.0 | 138 | 395 | 222 | 27 | 120 | 88 | 0 | 8 | 0 | Benign | None |
| 4 | 42 | 16 | 11.452381 | 281.333333 | 2067 | 45 | 1252 | 375 | 0.047820 | 12 | 77 | 4 | 0 | 4 | 0.0 | 138 | 392 | 222 | 24 | 124 | 87 | 0 | 8 | 0 | Benign | None |
# List the columns that remain in df2
df2.columns
Index(['pslist.nproc', 'pslist.nppid', 'pslist.avg_threads',
'pslist.avg_handlers', 'dlllist.ndlls', 'handles.ndesktop',
'handles.nkey', 'handles.nsection', 'ldrmodules.not_in_load_avg',
'malfind.ninjections', 'malfind.commitCharge', 'psxview.not_in_pslist',
'psxview.not_in_eprocess_pool', 'psxview.not_in_ethread_pool',
'psxview.not_in_eprocess_pool_false_avg', 'modules.nmodules',
'svcscan.nservices', 'svcscan.kernel_drivers',
'svcscan.process_services', 'svcscan.nactive', 'callbacks.ncallbacks',
'callbacks.nanonymous', 'callbacks.ngeneric', 'Class', 'category_new',
'type'],
dtype='object')
Separate the features and target variable
The target variable is the Class variable.
# Features: every remaining column except the target ('Class') and the two
# descriptive label columns. drop() returns a new frame, so df2 is untouched.
X = df2.drop(columns=['category_new', 'type', 'Class'])
# Target: the encoded Class column as a NumPy array
y = df2['Class'].values
display(X)
y
| pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.avg_handlers | dlllist.ndlls | handles.ndesktop | handles.nkey | handles.nsection | ldrmodules.not_in_load_avg | malfind.ninjections | malfind.commitCharge | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_eprocess_pool_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45 | 17 | 10.555556 | 202.844444 | 1694 | 46 | 716 | 184 | 0.030372 | 5 | 21 | 2 | 0 | 3 | 0.0 | 138 | 389 | 221 | 24 | 121 | 87 | 0 | 8 |
| 1 | 47 | 19 | 11.531915 | 242.234043 | 2074 | 51 | 1011 | 337 | 0.036167 | 12 | 77 | 0 | 0 | 0 | 0.0 | 138 | 392 | 222 | 24 | 122 | 87 | 0 | 8 |
| 2 | 40 | 14 | 14.725000 | 288.225000 | 1932 | 45 | 784 | 369 | 0.026114 | 5 | 6 | 0 | 0 | 0 | 0.0 | 137 | 395 | 222 | 27 | 120 | 88 | 0 | 8 |
| 3 | 32 | 13 | 13.500000 | 264.281250 | 1445 | 36 | 654 | 186 | 0.021483 | 2 | 2 | 0 | 0 | 0 | 0.0 | 138 | 395 | 222 | 27 | 120 | 88 | 0 | 8 |
| 4 | 42 | 16 | 11.452381 | 281.333333 | 2067 | 45 | 1252 | 375 | 0.047820 | 12 | 77 | 4 | 0 | 4 | 0.0 | 138 | 392 | 222 | 24 | 124 | 87 | 0 | 8 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 58591 | 37 | 15 | 10.108108 | 215.486487 | 1453 | 40 | 668 | 161 | 0.029292 | 3 | 3 | 1 | 0 | 1 | 0.0 | 138 | 389 | 221 | 24 | 120 | 86 | 0 | 8 |
| 58592 | 37 | 14 | 9.945946 | 190.216216 | 1347 | 39 | 555 | 111 | 0.030958 | 3 | 3 | 0 | 0 | 0 | 0.0 | 138 | 389 | 221 | 24 | 116 | 88 | 0 | 8 |
| 58593 | 38 | 15 | 9.842105 | 210.026316 | 1448 | 40 | 673 | 167 | 0.029292 | 3 | 3 | 2 | 0 | 3 | 0.0 | 138 | 389 | 221 | 24 | 120 | 88 | 0 | 8 |
| 58594 | 37 | 15 | 10.243243 | 215.513513 | 1452 | 40 | 668 | 161 | 0.029392 | 3 | 3 | 0 | 0 | 0 | 0.0 | 138 | 389 | 221 | 24 | 120 | 87 | 0 | 8 |
| 58595 | 38 | 15 | 9.868421 | 213.026316 | 1487 | 41 | 683 | 164 | 0.029255 | 3 | 3 | 3 | 0 | 3 | 0.0 | 138 | 389 | 221 | 24 | 120 | 86 | 0 | 8 |
58062 rows × 23 columns
array([0, 0, 0, ..., 1, 1, 1], dtype=int64)
Split Data into the training set and test set
70% of the data will be used to train the models, while the remaining 30% will be used to evaluate them.
from sklearn.model_selection import train_test_split
# Fixed seed so the split (and the models trained later) are reproducible
SEED = 20
# 70/30 split, stratified on y so both subsets keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=SEED,
    stratify=y,
)
# Check: show the size and contents of each subset
for _subset, _features, _target in (("Training", X_train, y_train),
                                    ("Testing", X_test, y_test)):
    print(f"{_subset} set")
    print(f"{_subset} set size", _features.shape)
    display(_features)
    display(_target)
Training set Training set size (40643, 23)
| pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.avg_handlers | dlllist.ndlls | handles.ndesktop | handles.nkey | handles.nsection | ldrmodules.not_in_load_avg | malfind.ninjections | malfind.commitCharge | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_eprocess_pool_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 36982 | 44 | 17 | 9.636364 | 200.931818 | 1674 | 47 | 699 | 182 | 0.029582 | 4 | 4 | 0 | 0 | 0 | 0.0 | 138 | 389 | 221 | 24 | 122 | 86 | 0 | 8 |
| 48875 | 28 | 10 | 12.964286 | 260.107143 | 1258 | 31 | 549 | 135 | 0.022551 | 2 | 2 | 0 | 0 | 0 | 0.0 | 138 | 389 | 221 | 24 | 117 | 88 | 0 | 8 |
| 53467 | 35 | 14 | 10.400000 | 215.400000 | 1384 | 37 | 599 | 138 | 0.030561 | 2 | 2 | 2 | 0 | 2 | 0.0 | 138 | 389 | 221 | 24 | 120 | 87 | 0 | 8 |
| 57183 | 39 | 15 | 10.743590 | 217.384615 | 1558 | 42 | 655 | 177 | 0.028696 | 3 | 3 | 3 | 0 | 3 | 0.0 | 138 | 389 | 221 | 24 | 123 | 87 | 0 | 8 |
| 55768 | 37 | 15 | 10.243243 | 215.054054 | 1446 | 40 | 667 | 161 | 0.029472 | 3 | 3 | 0 | 0 | 0 | 0.0 | 138 | 389 | 221 | 24 | 119 | 86 | 0 | 8 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2764 | 42 | 12 | 13.316530 | 301.911126 | 2194 | 46 | 941 | 416 | 0.040050 | 6 | 7 | 0 | 0 | 1 | 0.0 | 138 | 395 | 222 | 27 | 124 | 88 | 0 | 8 |
| 22016 | 50 | 18 | 11.985931 | 271.454189 | 2375 | 56 | 1043 | 461 | 0.043621 | 8 | 10 | 2 | 0 | 2 | 0.0 | 138 | 392 | 222 | 24 | 126 | 87 | 0 | 8 |
| 49923 | 38 | 15 | 9.815789 | 209.447368 | 1446 | 40 | 667 | 160 | 0.029372 | 3 | 3 | 3 | 0 | 4 | 0.0 | 138 | 389 | 221 | 24 | 119 | 86 | 0 | 8 |
| 57132 | 44 | 17 | 9.795455 | 200.659091 | 1630 | 47 | 682 | 181 | 0.043152 | 3 | 3 | 0 | 0 | 0 | 0.0 | 138 | 389 | 221 | 24 | 122 | 54 | 0 | 8 |
| 15985 | 42 | 12 | 13.297032 | 302.387573 | 2177 | 46 | 877 | 414 | 0.036532 | 5 | 6 | 1 | 0 | 1 | 0.0 | 138 | 395 | 222 | 27 | 123 | 88 | 0 | 8 |
40643 rows × 23 columns
array([1, 1, 1, ..., 1, 1, 0], dtype=int64)
Testing set Testing set size (17419, 23)
| pslist.nproc | pslist.nppid | pslist.avg_threads | pslist.avg_handlers | dlllist.ndlls | handles.ndesktop | handles.nkey | handles.nsection | ldrmodules.not_in_load_avg | malfind.ninjections | malfind.commitCharge | psxview.not_in_pslist | psxview.not_in_eprocess_pool | psxview.not_in_ethread_pool | psxview.not_in_eprocess_pool_false_avg | modules.nmodules | svcscan.nservices | svcscan.kernel_drivers | svcscan.process_services | svcscan.nactive | callbacks.ncallbacks | callbacks.nanonymous | callbacks.ngeneric | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 21543 | 31 | 11 | 13.703655 | 305.364025 | 1571 | 33 | 757 | 262 | 0.036152 | 3 | 4 | 8 | 0 | 9 | 0.0 | 138 | 395 | 222 | 27 | 123 | 88 | 0 | 8 |
| 58064 | 41 | 16 | 9.853659 | 208.853658 | 1613 | 44 | 675 | 181 | 0.027190 | 3 | 3 | 4 | 0 | 4 | 0.0 | 138 | 389 | 221 | 24 | 123 | 86 | 0 | 8 |
| 47128 | 40 | 15 | 10.650000 | 214.450000 | 1575 | 43 | 665 | 179 | 0.028395 | 3 | 3 | 5 | 0 | 5 | 0.0 | 138 | 389 | 221 | 24 | 123 | 87 | 0 | 8 |
| 16592 | 40 | 12 | 13.308599 | 301.008599 | 2001 | 44 | 795 | 449 | 0.032357 | 3 | 4 | 2 | 0 | 2 | 0.0 | 138 | 395 | 222 | 27 | 123 | 88 | 0 | 8 |
| 2763 | 42 | 17 | 11.608303 | 262.875666 | 1875 | 46 | 909 | 370 | 0.041367 | 4 | 5 | 9 | 0 | 10 | 0.0 | 138 | 392 | 222 | 24 | 126 | 87 | 0 | 8 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 560 | 40 | 12 | 13.725000 | 302.350000 | 2001 | 44 | 795 | 452 | 0.032357 | 3 | 4 | 2 | 0 | 2 | 0.0 | 138 | 395 | 222 | 27 | 123 | 88 | 0 | 8 |
| 22204 | 66 | 18 | 10.938011 | 247.650504 | 2911 | 66 | 1116 | 606 | 0.036042 | 9 | 47 | 0 | 0 | 4 | 0.0 | 138 | 392 | 222 | 24 | 125 | 87 | 0 | 8 |
| 15933 | 40 | 12 | 12.700000 | 292.175287 | 1985 | 44 | 803 | 435 | 0.033126 | 3 | 4 | 0 | 0 | 0 | 0.0 | 138 | 395 | 222 | 27 | 123 | 88 | 0 | 8 |
| 57302 | 44 | 17 | 9.590909 | 196.795455 | 1619 | 46 | 686 | 183 | 0.038341 | 2 | 2 | 1 | 0 | 1 | 0.0 | 138 | 389 | 221 | 24 | 122 | 55 | 0 | 8 |
| 13364 | 40 | 12 | 12.731670 | 288.333983 | 1985 | 44 | 803 | 431 | 0.033126 | 3 | 4 | 0 | 0 | 0 | 0.0 | 138 | 395 | 222 | 27 | 122 | 88 | 0 | 8 |
17419 rows × 23 columns
array([0, 1, 1, ..., 0, 1, 0], dtype=int64)
Feature Scaling
Apply normalization to the features to keep them on the same scale.
# Feature scaling: fit the min-max scaler on the training split only, then
# apply that same fitted mapping to both splits so the test data never
# influences the scaler.
from sklearn.preprocessing import MinMaxScaler

normalizer = MinMaxScaler().fit(X_train)
scaled_X_train = normalizer.transform(X_train)
scaled_X_test = normalizer.transform(X_test)

# Show both scaled arrays: the first explicitly, the second as the cell value.
display(scaled_X_train)
scaled_X_test
array([[0.10502283, 0.140625 , 0.52652083, ..., 0.91666667, 0. ,
1. ],
[0.03196347, 0.03125 , 0.74592234, ..., 0.97222222, 0. ,
1. ],
[0.06392694, 0.09375 , 0.57686545, ..., 0.94444444, 0. ,
1. ],
...,
[0.07762557, 0.109375 , 0.53834992, ..., 0.91666667, 0. ,
1. ],
[0.10502283, 0.140625 , 0.53700929, ..., 0.02777778, 0. ,
1. ],
[0.09589041, 0.0625 , 0.76785946, ..., 0.97222222, 0. ,
1. ]])
array([[0.0456621 , 0.046875 , 0.79466708, ..., 0.97222222, 0. ,
1. ],
[0.0913242 , 0.125 , 0.54084653, ..., 0.91666667, 0. ,
1. ],
[0.08675799, 0.109375 , 0.59334732, ..., 0.94444444, 0. ,
1. ],
...,
[0.08675799, 0.0625 , 0.72849865, ..., 0.97222222, 0. ,
1. ],
[0.10502283, 0.140625 , 0.52352412, ..., 0.05555556, 0. ,
1. ],
[0.08675799, 0.0625 , 0.73058654, ..., 0.97222222, 0. ,
1. ]])
Training Logistic Regression Model
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the scaled training features,
# with the solver's iteration cap set to 1000.
lr_model = LogisticRegression(max_iter=1000).fit(scaled_X_train, y_train)

# Predict on the scaled test split, then report the summary metrics, the
# per-class report and the confusion matrix.
lr_y_pred = lr_model.predict(scaled_X_test)
lr_test_result = evaluate_model(y_test, lr_y_pred)
print(lr_test_result, classification_report(y_test, lr_y_pred), sep="\n")
plot_confusion_matrix(y_test, lr_y_pred, label)
{'accuracy_score': 0.9946610023537517, 'precision_score': 0.9946612920977461, 'recall_score': 0.9946610023537517, 'f1_score': 0.9946609865869084}
precision recall f1-score support
0 0.99 1.00 0.99 8770
1 1.00 0.99 0.99 8649
accuracy 0.99 17419
macro avg 0.99 0.99 0.99 17419
weighted avg 0.99 0.99 0.99 17419
Training a Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with default hyperparameters; SEED fixes the
# estimator's random_state.
rf_model = RandomForestClassifier(random_state=SEED).fit(scaled_X_train, y_train)

# Predict on the scaled test split, then report the summary metrics, the
# per-class report and the confusion matrix.
rf_y_pred = rf_model.predict(scaled_X_test)
rf_test_result = evaluate_model(y_test, rf_y_pred)
print(rf_test_result, classification_report(y_test, rf_y_pred), sep="\n")
plot_confusion_matrix(y_test, rf_y_pred, label)
{'accuracy_score': 1.0, 'precision_score': 1.0, 'recall_score': 1.0, 'f1_score': 1.0}
precision recall f1-score support
0 1.00 1.00 1.00 8770
1 1.00 1.00 1.00 8649
accuracy 1.00 17419
macro avg 1.00 1.00 1.00 17419
weighted avg 1.00 1.00 1.00 17419
The model achieves a perfect score on the test set, which may indicate overfitting. I will fine-tune the model using GridSearchCV to select the best hyperparameters for the random forest model.
# Fine-tune the random forest with a grid search over its hyperparameters
from sklearn.model_selection import GridSearchCV

rf_model = RandomForestClassifier(random_state=SEED)

# Candidate settings: both split criteria, with 100 and 200 trees
# (np.arange excludes the stop value 300).
parameters = [
    {
        'criterion': ['gini', 'entropy'],
        'n_estimators': np.arange(100, 300, 100),
    }
]

# 5-fold cross-validation scored by accuracy; n_jobs=-1 uses all the
# system's processors.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=parameters,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(scaled_X_train, y_train)

# Best cross-validated accuracy and the parameter combination that gave it
best_acc = grid_search.best_score_
best_parameters = grid_search.best_params_
rf_score = best_acc * 100
print(f"Best accuracy score {rf_score:.2f} %")
print("Best Parameters:", best_parameters)

# Pull out the best random forest found by the search
best_rf_model = grid_search.best_estimator_

# Predict on the scaled test split with the tuned model and report the
# summary metrics, the per-class report and the confusion matrix.
rf_y_pred = best_rf_model.predict(scaled_X_test)
rf_test_result = evaluate_model(y_test, rf_y_pred)
print(rf_test_result, classification_report(y_test, rf_y_pred), sep="\n")
plot_confusion_matrix(y_test, rf_y_pred, label)
Best accuracy score 99.99 %
Best Parameters: {'criterion': 'gini', 'n_estimators': 100}
{'accuracy_score': 1.0, 'precision_score': 1.0, 'recall_score': 1.0, 'f1_score': 1.0}
precision recall f1-score support
0 1.00 1.00 1.00 8770
1 1.00 1.00 1.00 8649
accuracy 1.00 17419
macro avg 1.00 1.00 1.00 17419
weighted avg 1.00 1.00 1.00 17419
Display the Top 10 Most Important Features
# Order feature positions by importance. argsort is ascending, so the ten
# most important features are the last ten entries.
sorted_idx = best_rf_model.feature_importances_.argsort()
top_idx = sorted_idx[-10:]

# Horizontal bar chart of the ten most important features
fig = plt.figure(figsize=(10, 10))
plt.barh(X_train.columns[top_idx], best_rf_model.feature_importances_[top_idx])
plt.title("Random Forest Feature Importance", fontsize=16)
plt.ylabel("Features", fontsize=14)
plt.xlabel('Importance', fontsize=14);

# List the same ten feature names, most important first
top_features = list(X_train.columns[top_idx][::-1])
print(top_features)
['handles.nkey', 'pslist.avg_handlers', 'svcscan.nservices', 'handles.nsection', 'svcscan.kernel_drivers', 'dlllist.ndlls', 'svcscan.nactive', 'pslist.avg_threads', 'svcscan.process_services', 'ldrmodules.not_in_load_avg']
Visualize the Results
# Compare the test-set metrics of the two models in a grouped bar chart.

# Gather each metric across the models, random forest first and logistic
# regression second, matching the order of `models` below.
accuracy_scores = [rf_test_result['accuracy_score'],
                   lr_test_result['accuracy_score']]
precision_scores = [rf_test_result['precision_score'],
                    lr_test_result['precision_score']]
recall_scores = [rf_test_result['recall_score'],
                 lr_test_result['recall_score']]
f1_scores = [rf_test_result['f1_score'],
             lr_test_result['f1_score']]

models = ["Random Forest", "Logistic Regression"]

# One group of bars per model. The count comes from `models` so the bar
# positions and tick labels stay in sync if the list of models changes.
X = np.arange(len(models))

fig = plt.figure(figsize=(8,6))
ax = fig.add_axes([0,0,1,1])

# (legend label, scores, colour, x-offset inside the group) for each metric.
# Each bar is 0.2 wide, so the four bars of a group sit side by side.
metric_series = [
    ("Accuracy", accuracy_scores, 'darkblue', 0.00),
    ("Precision", precision_scores, 'orange', 0.20),
    ("Recall", recall_scores, 'brown', 0.40),
    ("F1", f1_scores, 'gray', 0.60),
]
for metric_name, scores, color, offset in metric_series:
    ax.bar(X + offset, scores, color=color, width=0.2, label=metric_name)

# Centre each tick label under its group: the four bars span offsets
# 0.0 to 0.6, whose midpoint is 0.3.
plt.xticks(X+0.30, models)
plt.ylim(0, 1)
plt.xlabel("ML Algorithms", fontsize=14)
plt.ylabel("Scores", fontsize=14)
plt.title('Test Set Result of Machine Learning Algorithms on Malware Detection', fontsize=16)
plt.legend();